use futures::future;
use scraper::{Html, Selector};
use std::sync::{Arc, Mutex};
use tokio::task::JoinHandle;
use url::Url;

// Number of concurrent fetch tasks.
const WORKERS: usize = 8;

// A fetched page: its URL and the links found on it.
type SiteStat = (Url, Vec<Url>);
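
// Fetch the start page, keep its off-site links in a shared work queue
// (Arc<Mutex<Vec<Url>>>), then spawn WORKERS tasks that drain the queue
// concurrently and report how many links each fetched page contains.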
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let addr = std::env::args().nth(1)
        .unwrap_or_else(|| "https://www.tmplab.org".to_string());
    let links = get_links(addr.as_ref()).await?;
    let addr = Url::parse(addr.as_ref())?;
    // Keep only links whose host differs from the start page's host.
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != addr.host())
        .collect();
    let links = Arc::new(Mutex::new(links));
    let joins = (0..WORKERS)
        .map(|n| spawn_worker(n, &links));
    let results: Vec<_> = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok())
        .flatten()
        .collect();
    for (url, links) in &results {
        println!("{url} : {} links", links.len());
    }
    println!("TOTAL: {}", results.len());
    Ok(())
}

// Interestingly, this function must not be async: tokio::spawn starts the
// task immediately and returns its JoinHandle synchronously, so there is
// nothing to await here. If it were async, the `map` in main would yield
// lazy futures of handles instead of handles, and no worker would start
// until join_all polled them.
fn spawn_worker(
    n: usize,
    links: &Arc<Mutex<Vec<Url>>>,
) -> JoinHandle<Vec<SiteStat>> {
    println!("Spawning worker {n}");
    let links = links.clone();
    tokio::spawn(async move { looper(links).await })
}
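
// Worker body: repeatedly pop a URL off the shared queue and fetch it,
// collecting one (url, links) pair per page until the queue runs dry.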
async fn looper(links: Arc<Mutex<Vec<Url>>>) -> Vec<SiteStat> {
    let mut results = vec![];
    loop {
        // Pop inside a block so the MutexGuard is dropped before the
        // .await below: a std::sync guard is not Send and must not be
        // held across an await point in a spawned task.
        let url = {
            let mut v = links.lock().unwrap();
            let Some(url) = v.pop() else {
                break;
            };
            url
        };
        println!("Start fetching {url}...");
        let res = match get_links(url.as_ref()).await {
            Err(_) => "nope",
            Ok(links) => {
                results.push((url.clone(), links));
                "YEA!"
            }
        };
        println!("Got {url} => {res}");
    }
    results
}
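
// Fetch one page and extract every absolute URL from its <a href="..."> tags.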
async fn get_links(
    url: &str,
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url)
        .await?
        .text()
        .await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            // Url::parse rejects relative hrefs, so those are skipped.
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
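
// Build note: this sketch assumes roughly these Cargo.toml dependencies
// (version numbers are indicative assumptions, not from the original):
//   tokio   = { version = "1", features = ["macros", "rt-multi-thread"] }
//   reqwest = "0.11"
//   scraper = "0.17"
//   url     = "2"
//   futures = "0.3"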