Skip to main content
⚡ Calmops

Web Scraping with Rust

Extract data from websites with reqwest, scraper, and advanced techniques

Web scraping—programmatically extracting data from websites—is invaluable for data collection, price monitoring, research, and competitive analysis. Rust’s performance, memory safety, and excellent HTTP libraries make it ideal for building scraping tools. This article covers techniques from basic HTML parsing to handling JavaScript-rendered content.


Why Rust for Web Scraping?

Compared to Python or JavaScript:

  • Speed: often 10-100x faster for CPU-bound parsing, since Rust compiles to native code (scraping itself is usually network-bound)
  • Memory efficiency: Rust’s ownership model prevents memory leaks and bloat
  • Concurrency: Handle thousands of concurrent requests safely
  • Type safety: Catch parsing errors at compile time
  • Single binary: Distribute without runtime dependencies

Setup and Dependencies

[dependencies]
reqwest = { version = "0.11", features = ["json"] }
tokio = { version = "1", features = ["full"] }
scraper = "0.17"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
regex = "1"
url = "2"
chrono = "0.4"

Basic HTML Parsing

Simple GET Request

// filepath: src/basic_scraping.rs
use reqwest::Client;
use scraper::Html;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let client = Client::new();

    // Download the page, sending a browser-like User-Agent header.
    let body = client
        .get("https://example.com")
        .header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
        .send()
        .await?
        .text()
        .await?;

    // Build a DOM tree from the raw HTML.
    let document = Html::parse_document(&body);

    // Print the contents of every <h1> element.
    let heading = scraper::Selector::parse("h1").unwrap();
    document
        .select(&heading)
        .for_each(|h1| println!("Title: {}", h1.inner_html()));

    Ok(())
}

CSS Selector Extraction

use scraper::{Html, Selector};
use serde::Deserialize;

/// One scraped article entry. All fields are raw strings taken straight
/// from the page — no date parsing or URL normalization happens here.
#[derive(Debug, Deserialize)]
pub struct Article {
    pub title: String,  // inner HTML of the h2.title element
    pub url: String,    // href attribute of a.permalink, as written in the page
    pub date: String,   // datetime attribute of the <time> element
    pub author: String, // inner HTML of the .author element
}

/// Extract all articles from a rendered HTML page.
///
/// Each `article.post` element is mapped to an [`Article`]. Missing
/// sub-elements produce empty strings rather than an error, so one
/// malformed entry never aborts the whole page.
pub fn scrape_articles(html: &str) -> Result<Vec<Article>, Box<dyn std::error::Error>> {
    let document = Html::parse_document(html);

    // Compile each selector once, outside the per-article loop.
    let article_selector = Selector::parse("article.post").unwrap();
    let title_selector = Selector::parse("h2.title").unwrap();
    let url_selector = Selector::parse("a.permalink").unwrap();
    let date_selector = Selector::parse("time").unwrap();
    let author_selector = Selector::parse(".author").unwrap();

    let mut articles = Vec::new();

    for article_elem in document.select(&article_selector) {
        // BUG FIX: the original chained a second `select(&title_selector)`
        // *inside* the already-matched <h2 class="title"> element. That
        // inner lookup can never match, so `title` was always empty.
        // A single select().next() is correct.
        let title = article_elem
            .select(&title_selector)
            .next()
            .map(|elem| elem.inner_html())
            .unwrap_or_default();

        let url = article_elem
            .select(&url_selector)
            .next()
            .and_then(|elem| elem.value().attr("href"))
            .unwrap_or_default()
            .to_string();

        let date = article_elem
            .select(&date_selector)
            .next()
            .and_then(|elem| elem.value().attr("datetime"))
            .unwrap_or_default()
            .to_string();

        let author = article_elem
            .select(&author_selector)
            .next()
            .map(|elem| elem.inner_html())
            .unwrap_or_default();

        articles.push(Article {
            title,
            url,
            date,
            author,
        });
    }

    Ok(articles)
}

Concurrent Scraping

Parallel Requests

// filepath: src/concurrent_scraping.rs
use reqwest::Client;
use tokio::task;
use futures::stream::{self, StreamExt};

/// Fetch every URL, keeping at most `concurrency` requests in flight.
///
/// Failed requests are silently dropped, so the returned bodies may be
/// fewer than the input URLs.
pub async fn scrape_multiple_urls(
    urls: Vec<&str>,
    concurrency: usize,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let client = Client::new();

    // Each URL becomes a future that resolves to Some(body) on success
    // and None on any network/body error.
    let bodies: Vec<Option<String>> = stream::iter(urls)
        .map(|url| {
            let client = client.clone();
            async move {
                let response = match client.get(url).send().await {
                    Ok(resp) => resp,
                    Err(_) => return None,
                };
                response.text().await.ok()
            }
        })
        .buffered(concurrency)
        .collect()
        .await;

    // Keep only the successful fetches.
    Ok(bodies.into_iter().flatten().collect())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let urls = vec![
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    ];

    // Cap the pool at 5 in-flight requests.
    let pages = scrape_multiple_urls(urls, 5).await?;
    println!("Scraped {} pages", pages.len());

    Ok(())
}

Rate Limiting

use std::time::Duration;
use tokio::time::sleep;
use async_rate_limit::AsyncRateLimit;

pub async fn scrape_with_rate_limit(
    urls: Vec<&str>,
    requests_per_second: u32,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let client = reqwest::Client::new();
    
    // Create rate limiter: N requests per second
    let limiter = AsyncRateLimit::new(requests_per_second as usize, 
                                      Duration::from_secs(1));
    
    let mut results = Vec::new();
    
    for url in urls {
        // Wait for rate limit
        limiter.acquire_one().await;
        
        let response = client.get(url).send().await?;
        results.push(response.text().await?);
        
        println!("Fetched: {}", url);
    }
    
    Ok(results)
}

// Alternative: Manual rate limiting
/// Fetch URLs one at a time, pausing `delay_ms` milliseconds after each
/// request as a simple politeness delay.
pub async fn scrape_with_delay(
    urls: Vec<&str>,
    delay_ms: u64,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let client = reqwest::Client::new();
    let pause = Duration::from_millis(delay_ms);
    let mut bodies = Vec::with_capacity(urls.len());

    for url in urls {
        let body = client.get(url).send().await?.text().await?;
        bodies.push(body);

        // Politeness pause before the next request (this also runs after
        // the final request, matching the original behavior).
        sleep(pause).await;
    }

    Ok(bodies)
}

Handling JavaScript-Rendered Content

Using Headless Browser (Chromium)

For JavaScript-heavy sites, you need a browser engine:

// filepath: src/js_rendering.rs
use headless_chrome::Browser;
use headless_chrome::protocol::cdp::Page;

/// Fetch `url` in a headless Chrome instance and return the HTML *after*
/// JavaScript has executed (a plain reqwest GET would return only the raw
/// server response).
///
/// NOTE(review): the two-argument `wait_for_element(selector, timeout)`
/// call matches an older headless_chrome API; recent releases expose the
/// timeout variant as `wait_for_element_with_custom_timeout`. Verify
/// against the pinned crate version.
pub fn scrape_javascript_content(url: &str) -> Result<String, Box<dyn std::error::Error>> {
    // Launch headless Chrome (requires a Chrome/Chromium binary on PATH)
    let browser = Browser::default()?;
    let tab = browser.wait_for_initial_tab()?;
    
    // Navigate to URL
    tab.navigate_to(url)?;
    
    // Wait for element to appear — <body> existing is used as the
    // "page has rendered" signal, with a 5 second budget
    tab.wait_for_element("body", std::time::Duration::from_secs(5))?;
    
    // Get rendered HTML (post-JavaScript DOM)
    let html = tab.get_content()?;
    
    Ok(html)
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let html = scrape_javascript_content("https://example.com")?;
    println!("Rendered HTML:\n{}", html);
    Ok(())
}

Add to Cargo.toml:

headless_chrome = "1.0"

Puppeteer-style API with Chromium

use std::time::Duration;

pub async fn scrape_with_actions(url: &str) -> Result<String, Box<dyn std::error::Error>> {
    // More advanced: interact with page
    let browser = headless_chrome::Browser::default()?;
    let tab = browser.wait_for_initial_tab()?;
    
    tab.navigate_to(url)?;
    
    // Wait for dynamic content
    tab.wait_for_element(".dynamic-content", Duration::from_secs(10))?;
    
    // Optionally: click elements, fill forms, etc.
    // tab.click_element("button.load-more")?;
    // tab.wait_for_element(".new-content", Duration::from_secs(5))?;
    
    let html = tab.get_content()?;
    Ok(html)
}

Data Extraction Patterns

Structured Data Extraction

// filepath: src/data_extraction.rs
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use regex::Regex;

/// One product parsed from a listing page. Numeric fields default to 0.0
/// and `in_stock` to false when the page omits or garbles the data.
#[derive(Debug, Serialize, Deserialize)]
pub struct Product {
    pub id: String,     // digits captured from the element's id attribute ("product-123" -> "123")
    pub name: String,   // inner HTML of h3.product-name
    pub price: f64,     // parsed from "$12.34"-style text
    pub rating: f64,    // parsed from "4.5/5"-style text
    pub in_stock: bool, // true when the stock label mentions "in stock" or "available"
}

/// Walk every `div.product-item` on the page and convert it into a
/// structured [`Product`].
///
/// Missing or unparseable fields fall back to defaults ("unknown" id,
/// empty name, 0.0 price/rating, out of stock), so a single bad listing
/// never aborts the scrape. Errors are only possible while compiling the
/// selectors and regexes up front.
pub fn extract_products(html: &str) -> Result<Vec<Product>, Box<dyn std::error::Error>> {
    let document = Html::parse_document(html);

    // Compile every selector and regex once, before iterating.
    let product_sel = Selector::parse("div.product-item")?;
    let name_sel = Selector::parse("h3.product-name")?;
    let price_sel = Selector::parse("span.price")?;
    let rating_sel = Selector::parse("div.rating")?;
    let stock_sel = Selector::parse("span.stock-status")?;

    let id_re = Regex::new(r"product-(\d+)")?;
    let price_re = Regex::new(r"\$([0-9.]+)")?;
    let rating_re = Regex::new(r"(\d+\.?\d*)/5")?;

    let products = document
        .select(&product_sel)
        .map(|item| {
            // id attribute looks like "product-123"; keep just the digits.
            let id = item
                .value()
                .attr("id")
                .and_then(|raw| Some(id_re.captures(raw)?.get(1)?.as_str()))
                .unwrap_or("unknown")
                .to_string();

            let name = item
                .select(&name_sel)
                .next()
                .map(|e| e.inner_html())
                .unwrap_or_default();

            // "$12.34" -> 12.34
            let price = item
                .select(&price_sel)
                .next()
                .and_then(|e| {
                    let text = e.inner_html();
                    price_re.captures(&text)?.get(1)?.as_str().parse::<f64>().ok()
                })
                .unwrap_or(0.0);

            // "4.5/5" -> 4.5
            let rating = item
                .select(&rating_sel)
                .next()
                .and_then(|e| {
                    let text = e.inner_html();
                    rating_re.captures(&text)?.get(1)?.as_str().parse::<f64>().ok()
                })
                .unwrap_or(0.0);

            // Any stock label containing "in stock" or "available" counts.
            let in_stock = item
                .select(&stock_sel)
                .next()
                .map_or(false, |e| {
                    let label = e.inner_html().to_lowercase();
                    label.contains("in stock") || label.contains("available")
                });

            Product { id, name, price, rating, in_stock }
        })
        .collect();

    Ok(products)
}

Advanced Techniques

Pagination Handling

// filepath: src/pagination.rs
use reqwest::Client;
use scraper::Html;

/// Scrape `total_pages` pages by appending a `page=N` query parameter to
/// `base_url`, pausing 500 ms between requests as a politeness delay.
///
/// The data-extraction hook is left commented out, matching the article's
/// placeholder — as written this returns an empty Vec.
pub async fn scrape_paginated(
    base_url: &str,
    total_pages: usize,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let client = Client::new();
    let mut all_data = Vec::new();

    for page in 1..=total_pages {
        // Choose the separator based on whether the base URL already
        // carries a query string.
        let separator = if base_url.contains('?') { '&' } else { '?' };
        let url = format!("{}{}page={}", base_url, separator, page);

        println!("Scraping page {} of {}", page, total_pages);

        let html = client.get(&url).send().await?.text().await?;

        // Extract data
        let document = Html::parse_document(&html);

        // Store extracted data
        // all_data.push(extract_page_data(&document)?);

        // Polite delay
        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    }

    Ok(all_data)
}

// Alternative: Auto-detect next page
pub async fn scrape_paginated_auto(
    start_url: &str,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let client = Client::new();
    let mut all_data = Vec::new();
    let mut current_url = start_url.to_string();
    
    loop {
        println!("Scraping: {}", current_url);
        
        let response = client.get(&current_url).send().await?;
        let html = response.text().await?;
        let document = Html::parse_document(&html);
        
        // Extract data from current page
        // all_data.push(extract_page_data(&document)?);
        
        // Find next page link
        let next_selector = scraper::Selector::parse("a.next-page").unwrap();
        
        match document
            .select(&next_selector)
            .next()
            .and_then(|elem| elem.value().attr("href"))
        {
            Some(next_link) => {
                current_url = if next_link.starts_with("http") {
                    next_link.to_string()
                } else {
                    format!("{}{}", start_url.split('/').take(3).collect::<Vec<_>>().join("/"), next_link)
                };
            }
            None => break, // No more pages
        }
        
        tokio::time::sleep(tokio::time::Duration::from_millis(500)).await;
    }
    
    Ok(all_data)
}
Session and Cookie Management

// filepath: src/sessions.rs
use reqwest::{Client, cookie::Jar};
use std::sync::Arc;
use url::Url;

/// Log in via a form POST, then fetch `target_url` using the session
/// cookies captured along the way.
pub async fn scrape_with_login(
    login_url: &str,
    username: &str,
    password: &str,
    target_url: &str,
) -> Result<String, Box<dyn std::error::Error>> {
    // One cookie jar shared by every request on this client, so the
    // session cookie set at login is replayed automatically afterwards.
    let cookies = Arc::new(Jar::default());
    let client = Client::builder()
        .cookie_provider(Arc::clone(&cookies))
        .build()?;

    // Prime the session with a GET first (many sites set a session or
    // CSRF cookie on the login page itself).
    let login_page = client.get(login_url).send().await?;
    println!("Login page status: {}", login_page.status());

    // Submit login form
    let credentials = [("username", username), ("password", password)];
    let login_response = client.post(login_url).form(&credentials).send().await?;
    println!("Login response status: {}", login_response.status());

    // Now access protected content
    let protected = client.get(target_url).send().await?;
    Ok(protected.text().await?)
}

Proxy and User-Agent Rotation

// filepath: src/proxies.rs
use reqwest::Client;
use rand::Rng;

// Pool of realistic desktop User-Agent strings. One is picked at random
// per request so traffic doesn't all carry the same client fingerprint.
const USER_AGENTS: &[&str] = &[
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
];

/// Pick one entry from `USER_AGENTS` uniformly at random.
pub fn get_random_user_agent() -> &'static str {
    USER_AGENTS[rand::thread_rng().gen_range(0..USER_AGENTS.len())]
}

/// Fetch each URL with a freshly built client that uses a random
/// User-Agent and a random proxy from `proxies`.
///
/// FIX: the original created one `rand::thread_rng()` (a `!Send`
/// `ThreadRng`) and held it across `.await` points, which made the
/// returned future `!Send` — it could not be passed to `tokio::spawn`.
/// The RNG is now scoped so it is dropped before any await.
///
/// Panics if `proxies` is empty (same as the original's indexing).
pub async fn scrape_with_rotation(
    urls: Vec<&str>,
    proxies: Vec<&str>,
) -> Result<Vec<String>, Box<dyn std::error::Error>> {
    let mut results = Vec::with_capacity(urls.len());

    for url in urls {
        // Rotate user agent
        let user_agent = get_random_user_agent();

        // Rotate proxy — the RNG lives only inside this block, which
        // contains no awaits, keeping the future Send.
        let proxy = {
            let mut rng = rand::thread_rng();
            proxies[rng.gen_range(0..proxies.len())]
        };

        // Proxies are client-level config in reqwest, so rotating them
        // requires a new client per URL.
        let client = Client::builder()
            .user_agent(user_agent)
            .proxy(reqwest::Proxy::https(proxy)?)
            .build()?;

        let response = client.get(url).send().await?;
        results.push(response.text().await?);

        println!("Fetched {} with UA: {}", url, user_agent);
    }

    Ok(results)
}

Complete Scraping Project

// filepath: src/main.rs
use reqwest::Client;
use scraper::{Html, Selector};
use serde::{Deserialize, Serialize};
use std::time::Duration;
use tokio::time::sleep;

// One headline scraped from the news listing page.
#[derive(Debug, Serialize, Deserialize)]
struct NewsItem {
    title: String, // inner HTML of h2.title
    link: String,  // href of a.link, exactly as written in the page
    date: String,  // datetime attribute of the <time> element
}

/// Reusable scraper: one configured HTTP client plus the listing URL it
/// targets.
pub struct Scraper {
    client: Client,   // shared connection pool; configured in new()
    base_url: String, // page that scrape_news() fetches
}

impl Scraper {
    /// Build a scraper with a descriptive User-Agent and a 10-second
    /// request timeout.
    ///
    /// Panics if the HTTP client cannot be constructed.
    pub fn new(base_url: &str) -> Self {
        Scraper {
            client: Client::builder()
                .user_agent("Mozilla/5.0 (Rust Web Scraper)")
                .timeout(Duration::from_secs(10))
                .build()
                .expect("Failed to create client"),
            base_url: base_url.to_string(),
        }
    }

    /// Download `base_url` and map every `article.news-item` element to a
    /// [`NewsItem`]; absent sub-elements become empty strings.
    pub async fn scrape_news(&self) -> Result<Vec<NewsItem>, Box<dyn std::error::Error>> {
        let html = self.client.get(&self.base_url).send().await?.text().await?;
        let document = Html::parse_document(&html);

        let item_sel = Selector::parse("article.news-item").unwrap();
        let title_sel = Selector::parse("h2.title").unwrap();
        let link_sel = Selector::parse("a.link").unwrap();
        let date_sel = Selector::parse("time").unwrap();

        let items = document
            .select(&item_sel)
            .map(|node| NewsItem {
                title: node
                    .select(&title_sel)
                    .next()
                    .map(|e| e.inner_html())
                    .unwrap_or_default(),
                link: node
                    .select(&link_sel)
                    .next()
                    .and_then(|e| e.value().attr("href"))
                    .unwrap_or_default()
                    .to_string(),
                date: node
                    .select(&date_sel)
                    .next()
                    .and_then(|e| e.value().attr("datetime"))
                    .unwrap_or_default()
                    .to_string(),
            })
            .collect();

        Ok(items)
    }
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let scraper = Scraper::new("https://example.com/news");

    // Print one line per scraped headline.
    for item in scraper.scrape_news().await? {
        println!("{} - {} ({})", item.title, item.link, item.date);
    }

    Ok(())
}

Ethical Scraping Guidelines

Respecting Robots.txt

/// Download and print a domain's robots.txt; the raw body is returned so
/// the caller can inspect crawl rules before scraping.
pub async fn check_robots_txt(domain: &str) -> Result<String, Box<dyn std::error::Error>> {
    let robots_url = format!("https://{}/robots.txt", domain);

    let content = reqwest::Client::new()
        .get(&robots_url)
        .send()
        .await?
        .text()
        .await?;

    println!("robots.txt content:\n{}", content);
    Ok(content)
}

Best Practices

  1. Check robots.txt before scraping
  2. Respect User-Agent rules in robots.txt
  3. Use appropriate delays between requests (500ms-1s)
  4. Identify yourself with a descriptive User-Agent
  5. Don’t overload servers - use connection pooling
  6. Cache responses to avoid repeated requests
  7. Handle 429 (Too Many Requests) gracefully
  8. Check Terms of Service before scraping
// Respectful scraper
/// Fetch `url` with a self-identifying User-Agent, backing off and
/// retrying once when the server answers 429 Too Many Requests.
///
/// FIX: the original logged "Waiting before retry...", slept 60 s, and
/// then returned the body of the *429 response* anyway — it never
/// actually retried. This version re-issues the request after the wait.
pub async fn respectful_scraper(url: &str) -> Result<String, Box<dyn std::error::Error>> {
    let client = reqwest::Client::builder()
        .user_agent("MyBot/1.0 (+https://example.com/bot)")
        .timeout(Duration::from_secs(10))
        .build()?;

    let mut response = client.get(url).send().await?;

    // Check for rate limiting: wait, then retry once.
    if response.status() == 429 {
        eprintln!("Rate limited! Waiting before retry...");
        sleep(Duration::from_secs(60)).await;
        response = client.get(url).send().await?;
    }

    Ok(response.text().await?)
}

Performance Optimization

Caching Responses

use std::collections::HashMap;
use std::time::{Duration, Instant};

/// In-memory response cache with a fixed time-to-live per entry.
///
/// Entries are never evicted; `get` simply refuses to return values older
/// than the TTL. Suitable for short-lived scraping runs.
pub struct Cache {
    entries: HashMap<String, (String, Instant)>, // value + insertion time
    ttl: Duration,
}

impl Cache {
    /// Create a cache whose entries expire `ttl_secs` seconds after insert.
    pub fn new(ttl_secs: u64) -> Self {
        Cache {
            entries: HashMap::new(),
            ttl: Duration::from_secs(ttl_secs),
        }
    }

    /// Return a clone of the cached value, or `None` when the key is
    /// missing or the entry has outlived the TTL.
    pub fn get(&self, key: &str) -> Option<String> {
        let (value, stored_at) = self.entries.get(key)?;
        if stored_at.elapsed() < self.ttl {
            Some(value.clone())
        } else {
            None
        }
    }

    /// Store (or overwrite) `key`, stamping it with the current time.
    pub fn insert(&mut self, key: String, value: String) {
        self.entries.insert(key, (value, Instant::now()));
    }
}

Error Handling

use thiserror::Error;

/// Error type covering the failure modes of a scraping run.
/// `#[from] reqwest::Error` lets `?` convert transport errors directly.
#[derive(Error, Debug)]
pub enum ScraperError {
    /// Transport-level failure (connection, TLS, timeout, body read).
    #[error("Network error: {0}")]
    NetworkError(#[from] reqwest::Error),
    
    /// Content could not be interpreted; also used for unexpected
    /// HTTP status codes.
    #[error("Parse error: {0}")]
    ParseError(String),
    
    /// HTTP 404 for the given URL.
    #[error("Not found: {0}")]
    NotFound(String),
    
    /// HTTP 429; the payload is the suggested wait in seconds.
    #[error("Rate limited, retry after {0} seconds")]
    RateLimited(u64),
}

pub async fn robust_fetch(url: &str) -> Result<String, ScraperError> {
    let client = reqwest::Client::new();
    
    let response = client.get(url)
        .send()
        .await
        .map_err(ScraperError::NetworkError)?;
    
    match response.status() {
        reqwest::StatusCode::OK => {
            response.text().await.map_err(ScraperError::NetworkError)
        }
        reqwest::StatusCode::TOO_MANY_REQUESTS => {
            Err(ScraperError::RateLimited(60))
        }
        reqwest::StatusCode::NOT_FOUND => {
            Err(ScraperError::NotFound(url.to_string()))
        }
        status => {
            Err(ScraperError::ParseError(format!("HTTP {}", status)))
        }
    }
}

Further Resources

Libraries

Tools

  • curl: Test URLs
  • Browser DevTools: Inspect HTML structure
  • Postman: Test API endpoints
  • Charles Proxy: Monitor network traffic

Reading


Scraping Checklist

  • Check robots.txt and Terms of Service
  • Use appropriate User-Agent headers
  • Implement rate limiting/delays
  • Handle errors gracefully
  • Respect server load
  • Cache responses when possible
  • Handle JavaScript rendering if needed
  • Test with real data before full run
  • Monitor rate limiting (429 responses)
  • Log progress and errors
  • Document data sources

Conclusion

Rust is excellent for web scraping because of its:

  1. Performance - Handle large datasets efficiently
  2. Safety - Memory-safe concurrent scraping
  3. Control - Low-level HTTP and parsing control
  4. Concurrency - Scale to thousands of parallel requests
  5. Reliability - Robust error handling

Always scrape responsibly and ethically!



Scrape responsibly! 🕷️

Comments