use scraper::{Html, Selector};
use tokio::sync::mpsc::{self, Sender};
use url::Url;

const WORKERS: usize = 8;

/// A crawled page: its URL and the links found on it.
type SiteStat = (Url, Vec<Url>);

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let addr = std::env::args()
        .nth(1)
        .unwrap_or_else(|| "https://www.tmplab.org".to_string());
    let links = get_links(addr.as_ref()).await?;
    let addr = Url::parse(addr.as_ref())?;
    // Keep only links that leave the seed host: each external page is
    // fetched once and its outgoing links are counted.
    let mut links: Vec<Url> = links
        .into_iter()
        .filter(|url| url.host() != addr.host())
        .collect();

    let (tx, mut rx) = mpsc::channel(32);

    // Spawn a pool of workers. Each slot holds the Sender side of one
    // worker's input channel; setting a slot to None retires that worker.
    let mut workers: Vec<Option<Sender<Url>>> = (0..WORKERS)
        .map(|n| Some(spawn_worker(n, tx.clone())))
        .collect();

    // Send one URL to each worker to get things started. Workers that get
    // no initial URL are retired immediately, so the shutdown check below
    // can still reach zero when there are fewer links than workers.
    for worker in &mut workers {
        if let Some(url) = links.pop() {
            worker.as_ref().unwrap().send(url).await.unwrap();
        } else {
            worker.take();
        }
    }
    // Drop main's copy of the result sender so `rx.recv()` yields `None`
    // once every worker has terminated.
    drop(tx);

    let mut results = vec![];

    // Receive the results and send a new URL back to the now-idle worker.
    while let Some((idx, res)) = rx.recv().await {
        if let Some(res) = res {
            results.push(res);
        }
        if let Some(addr) = links.pop() {
            workers[idx].as_ref().unwrap().send(addr).await.unwrap();
        } else {
            // No work left: dropping the sender closes this worker's input
            // channel, which lets its task terminate.
            workers[idx].take();
            if workers.iter().all(|w| w.is_none()) {
                break;
            }
        }
    }

    for (url, links) in &results {
        println!("{url} : {} links", links.len());
    }
    println!("TOTAL: {}", results.len());
    Ok(())
}

/// Spawn a worker task that fetches every URL sent to it and reports
/// `(worker_id, Option<SiteStat>)` on `tx`. Returns the worker's input channel.
fn spawn_worker(id: usize, tx: Sender<(usize, Option<SiteStat>)>) -> Sender<Url> {
    let (tx1, mut rx) = mpsc::channel::<Url>(16);
    tokio::spawn(async move {
        println!("Start worker {id}");
        while let Some(url) = rx.recv().await {
            println!("Start fetching {url}");
            // A fetch or parse error is reported as None instead of
            // killing the worker.
            let res = get_links(url.as_ref()).await.map(|v| (url.clone(), v)).ok();
            println!("Got {url}");
            tx.send((id, res)).await.unwrap();
        }
        println!("Terminate worker {id}");
    });
    tx1
}

/// Fetch `url` and return every absolute link found in an `<a href="...">`.
/// Relative hrefs fail `Url::parse` and are skipped, which is acceptable
/// here because the crawler only follows links to other hosts anyway.
async fn get_links(url: &str) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url).await?.text().await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| {
            link.value()
                .attr("href")
                .and_then(|href| Url::parse(href).ok())
        })
        .collect())
}
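
// ---------------------------------------------------------------------------
// Build/run notes: a minimal sketch of the required Cargo.toml. The version
// numbers are assumptions (the original source does not pin any), but the
// tokio features listed are the ones this file actually uses: the `#[tokio::main]`
// macro, the multi-threaded runtime, and the `sync` mpsc channels.
//
//   [dependencies]
//   tokio   = { version = "1", features = ["macros", "rt-multi-thread", "sync"] }
//   reqwest = "0.11"
//   scraper = "0.13"
//   url     = "2"
//
// Run against the default seed, or pass another start URL as the first argument:
//
//   cargo run
//   cargo run -- https://www.tmplab.org
// ---------------------------------------------------------------------------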