commit 3e424a86bbf53ccf42f9e91d68e63308f41fb73b
Author: Marc Planard
Date:   Fri Oct 13 12:30:32 2023 +0200

    initial commit

diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..0ea58a5
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "crawler"
+version = "0.1.0"
+edition = "2021"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+futures = "0.3.28"
+reqwest = { version = "0.11.22", features = ["rustls"] }
+scraper = "0.17.1"
+tokio = { version = "1.33.0", features = ["macros", "rt-multi-thread", "tracing"] }
+url = "2.4.1"
diff --git a/examples/livecoding.rs b/examples/livecoding.rs
new file mode 100644
index 0000000..8b99b33
--- /dev/null
+++ b/examples/livecoding.rs
@@ -0,0 +1,65 @@
+use scraper::{Html,Selector};
+use url::{Url,Host};
+//use tokio::sync::mpsc::{self, Sender, Receiver};
+use std::sync::{Arc,Mutex};
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+
+    let links = get_links("https://tmplab.org").await?;
+    let links : Vec<Url> = links.into_iter()
+        .filter(| url | url.host() != Some(Host::Domain("www.tmplab.org")))
+        .collect();
+
+    let links1 = Arc::new(Mutex::new(links));
+    let links2 = links1.clone();
+    let links3 = links1.clone();
+    let links4 = links1.clone();
+
+    let h1 = tokio::spawn(async move { looper(links1).await; });
+    let h2 = tokio::spawn(async move { looper(links2).await; });
+    let h3 = tokio::spawn(async move { looper(links3).await; });
+    let h4 = tokio::spawn(async move { looper(links4).await; });
+
+    h1.await?;
+    h2.await?;
+    h3.await?;
+    h4.await?;
+
+    Ok(())
+}
+
+async fn looper(links: Arc<Mutex<Vec<Url>>>) {
+    loop {
+        let url = {
+            let mut v = links.lock().unwrap();
+            if let Some(url) = v.pop() {
+                url
+            } else{
+                return;
+            }
+        };
+        let res = match get_links(&url.to_string()).await {
+            Err(_) => "nope",
+            Ok(_) => "YEA!"
+        };
+        println!("{url} => {res}");
+    }
+}
+
+async fn get_links(
+    url: &str
+) -> Result<Vec<Url>, reqwest::Error> {
+    let a_selector = Selector::parse("a[href]").unwrap();
+
+    let body = reqwest::get(url)
+        .await?
+        .text()
+        .await?;
+
+    Ok(Html::parse_document(&body)
+        .select(&a_selector)
+        .filter_map(| link | link.value().attr("href")
+            .and_then(| href | Url::parse(href).ok()))
+        .collect())
+}
diff --git a/examples/livecoding_cleaner.rs b/examples/livecoding_cleaner.rs
new file mode 100644
index 0000000..fdc284d
--- /dev/null
+++ b/examples/livecoding_cleaner.rs
@@ -0,0 +1,88 @@
+use scraper::{Html,Selector};
+use url::{Url,Host};
+//use tokio::sync::mpsc::{self, Sender, Receiver};
+use std::sync::{Arc,Mutex};
+use tokio::task::JoinHandle;
+use futures::future;
+
+const WORKERS : usize = 8;
+
+type SiteStat = (Url, Vec<Url>);
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let addr = std::env::args().nth(1)
+        .unwrap_or_else(|| "www.tmplab.org".to_string());
+
+    let links = get_links("https://tmplab.org").await?;
+    let links : Vec<Url> = links.into_iter()
+        .filter(| url | url.host() != Some(Host::Domain(&addr)))
+        .collect();
+
+    let links = Arc::new(Mutex::new(links));
+
+    let joins = (0..WORKERS)
+        .map(| n | spawn_worker(n, &links));
+
+    let results = future::join_all(joins).await
+        .into_iter()
+        .filter_map(| r | r.ok())
+        .flatten();
+
+    for (url, links) in results {
+        println!("{url} : {} links", links.len())
+    }
+
+    Ok(())
+}
+
+// interestingly, this function must not be async...
+fn spawn_worker(
+    n: usize,
+    links: &Arc<Mutex<Vec<Url>>>
+) -> JoinHandle<Vec<SiteStat>> {
+    println!("Spawning worker {n}");
+    let links = links.clone();
+    tokio::spawn(async move { looper(links).await })
+}
+
+async fn looper(links: Arc<Mutex<Vec<Url>>>) -> Vec<SiteStat> {
+    let mut results = vec![];
+
+    loop {
+        let url = {
+            let mut v = links.lock().unwrap();
+            let Some(url) = v.pop() else {
+                break;
+            };
+            url
+        };
+        println!("Start fetching {url}...");
+        let res = match get_links(url.as_ref()).await {
+            Err(_) => "nope",
+            Ok(links) => {
+                results.push((url.clone(), links));
+                "YEA!"
+            }
+        };
+        println!("Got {url} => {res}");
+    }
+    results
+}
+
+async fn get_links(
+    url: &str
+) -> Result<Vec<Url>, reqwest::Error> {
+    let a_selector = Selector::parse("a[href]").unwrap();
+
+    let body = reqwest::get(url)
+        .await?
+        .text()
+        .await?;
+
+    Ok(Html::parse_document(&body)
+        .select(&a_selector)
+        .filter_map(| link | link.value().attr("href")
+            .and_then(| href | Url::parse(href).ok()))
+        .collect())
+}
diff --git a/examples/livecoding_simple.rs b/examples/livecoding_simple.rs
new file mode 100644
index 0000000..6d89303
--- /dev/null
+++ b/examples/livecoding_simple.rs
@@ -0,0 +1,56 @@
+use scraper::{Html,Selector};
+use url::{Url,Host};
+use futures::future;
+
+type SiteStat = (Url, Vec<Url>);
+
+#[tokio::main]
+async fn main() -> Result<(), Box<dyn std::error::Error>> {
+    let addr = std::env::args().nth(1)
+        .unwrap_or_else(|| "www.tmplab.org".to_string());
+
+    let links = get_links("https://tmplab.org").await?;
+    let links : Vec<Url> = links.into_iter()
+        .filter(| url | url.host() != Some(Host::Domain(&addr)))
+        .collect();
+
+    let joins = links.into_iter()
+        .map(| url | {
+            tokio::spawn(async move { fetch_site(url).await })
+        });
+
+    let results = future::join_all(joins).await
+        .into_iter()
+        .filter_map(| r | r.ok());
+
+    for (url, links) in results {
+        println!("{url} : {} links", links.len())
+    }
+
+    Ok(())
+}
+
+async fn fetch_site(url: Url) -> SiteStat {
+    println!("Start fetching {url}");
+    let links = get_links(url.as_ref()).await
+        .unwrap_or_else(|_| vec![]);
+    println!("Got {url} => {}", links.len());
+    (url, links)
+}
+
+async fn get_links(
+    url: &str
+) -> Result<Vec<Url>, reqwest::Error> {
+    let a_selector = Selector::parse("a[href]").unwrap();
+
+    let body = reqwest::get(url)
+        .await?
+        .text()
+        .await?;
+
+    Ok(Html::parse_document(&body)
+        .select(&a_selector)
+        .filter_map(| link | link.value().attr("href")
+            .and_then(| href | Url::parse(href).ok()))
+        .collect())
+}
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 0000000..131ac0e
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,13 @@
+fn main() {
+    println!(r#"
+This is not the code you're looking for.
+
+Look into examples/
+
+Try:
+
+ > cargo run --release --example livecoding
+ > cargo run --release --example livecoding_cleaner
+ > cargo run --release --example livecoding_simple
+"#);
+}