use scraper::{Html, Selector};
use url::{Host, Url};
use futures::future;

// Per-site result: the site's URL plus every link harvested from its page.
type SiteStat = (Url, Vec<Url>);
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Host to exclude, so only links pointing away from the start site remain.
    let addr = std::env::args()
        .nth(1)
        .unwrap_or_else(|| "www.tmplab.org".to_string());
    let links = get_links("https://tmplab.org").await?;
    let links: Vec<Url> = links
        .into_iter()
        .filter(|url| url.host() != Some(Host::Domain(&addr)))
        .collect();
    // Spawn one task per link so all the fetches run concurrently.
    let joins = links
        .into_iter()
        .map(|url| tokio::spawn(async move { fetch_site(url).await }));
    // join_all drives every task to completion; filter_map keeps the
    // successful results and drops any task that panicked (JoinError).
    let results = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok());
    for (url, links) in results {
        println!("{url} : {} links", links.len());
    }
    Ok(())
}
// Fetch one site and return its URL together with the links found there.
// Errors are swallowed here: a site that fails to load yields no links.
async fn fetch_site(url: Url) -> SiteStat {
    println!("Start fetching {url}");
    let links = get_links(url.as_ref()).await
        .unwrap_or_else(|_| vec![]);
    println!("Got {url} => {}", links.len());
    (url, links)
}
// Download a page and collect every parseable href from its <a> tags.
// Relative hrefs fail Url::parse and are silently skipped.
async fn get_links(url: &str) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url)
        .await?
        .text()
        .await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
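
For completeness, here is a minimal Cargo.toml dependency section this listing appears to assume; the version numbers are a guess, not taken from the original, so adjust them to whatever your toolchain resolves:

    [dependencies]
    tokio   = { version = "1", features = ["full"] }
    futures = "0.3"
    reqwest = "0.11"
    scraper = "0.17"
    url     = "2"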