initial commit

Marc Planard 2023-10-13 12:30:32 +02:00
commit 3e424a86bb
5 changed files with 235 additions and 0 deletions

Cargo.toml (new file, +13)

@@ -0,0 +1,13 @@
[package]
name = "crawler"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
futures = "0.3.28"
reqwest = { version = "0.11.22", features = ["rustls"] }
scraper = "0.17.1"
tokio = { version = "1.33.0", features = ["macros", "rt-multi-thread", "tracing"] }
url = "2.4.1"

examples/livecoding.rs (new file, +65)

@@ -0,0 +1,65 @@
use scraper::{Html, Selector};
use url::{Url, Host};
//use tokio::sync::mpsc::{self, Sender, Receiver};
use std::sync::{Arc, Mutex};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Seed the work list with the links found on the start page,
    // keeping only URLs that point away from www.tmplab.org.
    let links = get_links("https://tmplab.org").await?;
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != Some(Host::Domain("www.tmplab.org")))
        .collect();

    // Share the work list between four tasks behind an Arc<Mutex<_>>.
    let links1 = Arc::new(Mutex::new(links));
    let links2 = links1.clone();
    let links3 = links1.clone();
    let links4 = links1.clone();

    let h1 = tokio::spawn(async move { looper(links1).await; });
    let h2 = tokio::spawn(async move { looper(links2).await; });
    let h3 = tokio::spawn(async move { looper(links3).await; });
    let h4 = tokio::spawn(async move { looper(links4).await; });

    h1.await?;
    h2.await?;
    h3.await?;
    h4.await?;
    Ok(())
}

// Pop URLs from the shared list until it is empty, fetching each one in turn.
async fn looper(links: Arc<Mutex<Vec<Url>>>) {
    loop {
        // Scope the lock so the MutexGuard is dropped before the .await below.
        let url = {
            let mut v = links.lock().unwrap();
            if let Some(url) = v.pop() {
                url
            } else {
                return;
            }
        };
        let res = match get_links(&url.to_string()).await {
            Err(_) => "nope",
            Ok(_) => "YEA!"
        };
        println!("{url} => {res}");
    }
}

// Fetch a page and return every URL found in an <a href="..."> attribute.
async fn get_links(
    url: &str
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url)
        .await?
        .text()
        .await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
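Note on the `let url = { ... };` block in looper: it confines the std::sync::MutexGuard to that scope, so the guard is dropped before get_links(...).await. A guard held across an .await would make the spawned future non-Send, which tokio::spawn rejects. A minimal standalone sketch of the same pattern, using a hypothetical queue of strings rather than the crawler's types:

use std::sync::{Arc, Mutex};

// Hypothetical stand-in for the crawler's per-URL work.
async fn process(item: String) {
    println!("processing {item}");
}

async fn drain(queue: Arc<Mutex<Vec<String>>>) {
    loop {
        // The guard lives only inside this block and is dropped here,
        // before the .await below, so the future stays Send.
        let item = {
            let mut q = queue.lock().unwrap();
            match q.pop() {
                Some(item) => item,
                None => return,
            }
        };
        process(item).await;
    }
}

#[tokio::main]
async fn main() {
    let queue = Arc::new(Mutex::new(vec!["a".to_string(), "b".to_string()]));
    let handle = tokio::spawn(drain(queue));
    handle.await.unwrap();
}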

examples/livecoding_cleaner.rs (new file, +88)

@@ -0,0 +1,88 @@
use scraper::{Html, Selector};
use url::{Url, Host};
//use tokio::sync::mpsc::{self, Sender, Receiver};
use std::sync::{Arc, Mutex};
use tokio::task::JoinHandle;
use futures::future;

const WORKERS: usize = 8;

type SiteStat = (Url, Vec<Url>);

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Host to exclude from the crawl; defaults to www.tmplab.org.
    let addr = std::env::args().nth(1)
        .unwrap_or_else(|| "www.tmplab.org".to_string());

    // Seed the work list with the off-site links found on the start page.
    let links = get_links("https://tmplab.org").await?;
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != Some(Host::Domain(&addr)))
        .collect();
    let links = Arc::new(Mutex::new(links));

    // Spawn a fixed pool of workers that drain the shared list,
    // then collect and flatten their per-worker results.
    let joins = (0..WORKERS)
        .map(|n| spawn_worker(n, &links));
    let results = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok())
        .flatten();
    for (url, links) in results {
        println!("{url} : {} links", links.len())
    }
    Ok(())
}

// Interestingly, this function must not be async: it has to return the
// JoinHandle itself so that join_all in main yields the workers' results
// (an async fn would wrap the handle in one more future).
fn spawn_worker(
    n: usize,
    links: &Arc<Mutex<Vec<Url>>>
) -> JoinHandle<Vec<SiteStat>> {
    println!("Spawning worker {n}");
    let links = links.clone();
    tokio::spawn(async move { looper(links).await })
}

// Pop URLs from the shared list until it is empty, recording the links
// found on each page that could be fetched.
async fn looper(links: Arc<Mutex<Vec<Url>>>) -> Vec<SiteStat> {
    let mut results = vec![];
    loop {
        // Scope the lock so the MutexGuard is dropped before the .await below.
        let url = {
            let mut v = links.lock().unwrap();
            let Some(url) = v.pop() else {
                break;
            };
            url
        };
        println!("Start fetching {url}...");
        let res = match get_links(url.as_ref()).await {
            Err(_) => "nope",
            Ok(links) => {
                results.push((url.clone(), links));
                "YEA!"
            }
        };
        println!("Got {url} => {res}");
    }
    results
}

// Fetch a page and return every URL found in an <a href="..."> attribute.
async fn get_links(
    url: &str
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url)
        .await?
        .text()
        .await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
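The result pipeline in main relies on future::join_all over JoinHandles yielding a Vec<Result<Vec<SiteStat>, JoinError>>: filter_map(|r| r.ok()) drops any worker that panicked, and flatten() merges the surviving per-worker vectors. A toy sketch of the same shape, with plain numbers standing in for SiteStat:

use futures::future;

#[tokio::main]
async fn main() {
    // Each task returns a Vec of items, like one crawler worker's results.
    let handles = (0..3).map(|n| tokio::spawn(async move { vec![n, n * 10] }));

    // join_all yields Vec<Result<Vec<i32>, JoinError>>; ok() drops failed
    // tasks, flatten() merges the per-task vectors into one stream of items.
    let results: Vec<i32> = future::join_all(handles)
        .await
        .into_iter()
        .filter_map(|r| r.ok())
        .flatten()
        .collect();

    println!("{results:?}");
}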

examples/livecoding_simple.rs (new file, +56)

@@ -0,0 +1,56 @@
use scraper::{Html, Selector};
use url::{Url, Host};
use futures::future;

type SiteStat = (Url, Vec<Url>);

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Host to exclude from the crawl; defaults to www.tmplab.org.
    let addr = std::env::args().nth(1)
        .unwrap_or_else(|| "www.tmplab.org".to_string());

    // Collect the off-site links from the start page and spawn one task per link.
    let links = get_links("https://tmplab.org").await?;
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != Some(Host::Domain(&addr)))
        .collect();
    let joins = links.into_iter()
        .map(|url| {
            tokio::spawn(async move { fetch_site(url).await })
        });
    let results = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok());
    for (url, links) in results {
        println!("{url} : {} links", links.len())
    }
    Ok(())
}

// Fetch one site and pair it with the links found there (empty on error).
async fn fetch_site(url: Url) -> SiteStat {
    println!("Start fetching {url}");
    let links = get_links(url.as_ref()).await
        .unwrap_or_else(|_| vec![]);
    println!("Got {url} => {}", links.len());
    (url, links)
}

// Fetch a page and return every URL found in an <a href="..."> attribute.
async fn get_links(
    url: &str
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();
    let body = reqwest::get(url)
        .await?
        .text()
        .await?;
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
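Unlike the worker-pool examples, this version spawns one task per link, so the number of concurrent fetches is bounded only by the number of links on the start page. With the futures crate already among the dependencies, the fan-out could also be capped with buffer_unordered; a possible variant, not part of this commit (fetch_site is stubbed out here, and the limit of 8 is arbitrary):

use futures::{stream, StreamExt};
use url::Url;

// Stand-in for the example's fetch_site, which fetches and parses the page.
async fn fetch_site(url: Url) -> (Url, Vec<Url>) {
    (url, vec![])
}

// Run at most `limit` fetches concurrently on the current task.
async fn fetch_all(links: Vec<Url>, limit: usize) -> Vec<(Url, Vec<Url>)> {
    stream::iter(links)
        .map(fetch_site)
        .buffer_unordered(limit)
        .collect()
        .await
}

#[tokio::main]
async fn main() {
    let links = vec![Url::parse("https://example.org").unwrap()];
    for (url, found) in fetch_all(links, 8).await {
        println!("{url} : {} links", found.len());
    }
}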

src/main.rs (new file, +13)

@@ -0,0 +1,13 @@
fn main() {
    println!(r#"
This is not the code you're looking for.
Look into examples/
Try:
> cargo run --release --example livecoding
> cargo run --release --example livecoding_cleaner
> cargo run --release --example livecoding_simple
"#);
}