use scraper::{Html, Selector};
use std::sync::{Arc, Mutex};
use tokio::task::JoinHandle;
use url::Url;
use futures::future;

/// Number of concurrent fetch workers.
const WORKERS: usize = 8;

/// A crawled page and the links found on it.
type SiteStat = (Url, Vec<Url>);

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Seed URL: first CLI argument, or a default.
    let addr = std::env::args().nth(1)
        .unwrap_or_else(|| "https://www.tmplab.org".to_string());
    let links = get_links(addr.as_ref()).await?;
    let addr = Url::parse(addr.as_ref())?;

    // Keep only links that point off-site.
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != addr.host())
        .collect();

    // Shared work queue, drained concurrently by the workers.
    let links = Arc::new(Mutex::new(links));
    let joins = (0..WORKERS).map(|n| spawn_worker(n, &links));

    // Wait for all workers and merge their results,
    // dropping any task that panicked.
    let results: Vec<_> = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok())
        .flatten()
        .collect();

    for (url, links) in &results {
        println!("{url} : {} links", links.len());
    }
    println!("TOTAL: {}", results.len());
    Ok(())
}

// Interestingly, this function must not be async: an async fn's body is
// lazy, so nothing would be spawned until the returned future is awaited.
fn spawn_worker(n: usize, links: &Arc<Mutex<Vec<Url>>>) -> JoinHandle<Vec<SiteStat>> {
    println!("Spawning worker {n}");
    let links = links.clone();
    tokio::spawn(async move { looper(links).await })
}

/// Pop URLs off the shared queue until it is empty,
/// collecting the links found on each page.
async fn looper(links: Arc<Mutex<Vec<Url>>>) -> Vec<SiteStat> {
    let mut results = vec![];
    loop {
        // Take one URL; the lock guard is dropped at the end of this
        // block, so it is never held across the await below.
        let url = {
            let mut v = links.lock().unwrap();
            let Some(url) = v.pop() else { break };
            url
        };
        println!("Start fetching {url}...");
        let res = match get_links(url.as_ref()).await {
            Err(_) => "nope",
            Ok(links) => {
                results.push((url.clone(), links));
                "YEA!"
            }
        };
        println!("Got {url} => {res}");
    }
    results
}

/// Fetch a page and return every absolute URL found in an <a href> tag;
/// relative hrefs fail Url::parse and are silently skipped.
async fn get_links(url: &str) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url)
        .await?
        .text()
        .await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
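For completeness, here is a sketch of a Cargo.toml that should build this program. The package name and the crate versions are my assumptions, not taken from the original; any reasonably recent releases of these crates should work.

# Cargo.toml (sketch; name and versions are assumptions)
[package]
name = "crawler"
version = "0.1.0"
edition = "2021"

[dependencies]
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
reqwest = "0.11"
scraper = "0.19"
url = "2"
futures = "0.3"

The "macros" and "rt-multi-thread" tokio features are what #[tokio::main] needs to set up the multi-threaded runtime the workers run on.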