initial commit
commit 3e424a86bb
Cargo.toml (new file, 13 lines)
@@ -0,0 +1,13 @@
[package]
name = "crawler"
version = "0.1.0"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
futures = "0.3.28"
reqwest = { version = "0.11.22", features = ["rustls-tls"] }
scraper = "0.17.1"
tokio = { version = "1.33.0", features = ["macros", "rt-multi-thread", "tracing"] }
url = "2.4.1"
examples/livecoding.rs (new file, 65 lines)
@@ -0,0 +1,65 @@
use scraper::{Html, Selector};
use url::{Url, Host};
//use tokio::sync::mpsc::{self, Sender, Receiver};
use std::sync::{Arc, Mutex};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let links = get_links("https://tmplab.org").await?;
    // Keep only links that point away from the site we just fetched.
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != Some(Host::Domain("www.tmplab.org")))
        .collect();

    let links1 = Arc::new(Mutex::new(links));
    let links2 = links1.clone();
    let links3 = links1.clone();
    let links4 = links1.clone();

    // Four hand-rolled workers draining the same shared list.
    let h1 = tokio::spawn(async move { looper(links1).await; });
    let h2 = tokio::spawn(async move { looper(links2).await; });
    let h3 = tokio::spawn(async move { looper(links3).await; });
    let h4 = tokio::spawn(async move { looper(links4).await; });

    h1.await?;
    h2.await?;
    h3.await?;
    h4.await?;

    Ok(())
}

async fn looper(links: Arc<Mutex<Vec<Url>>>) {
    loop {
        // Pop inside a block so the MutexGuard is dropped
        // before the .await below.
        let url = {
            let mut v = links.lock().unwrap();
            if let Some(url) = v.pop() {
                url
            } else {
                return;
            }
        };
        let res = match get_links(&url.to_string()).await {
            Err(_) => "nope",
            Ok(_) => "YEA!",
        };
        println!("{url} => {res}");
    }
}

async fn get_links(
    url: &str
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();

    let body = reqwest::get(url)
        .await?
        .text()
        .await?;

    // Absolute URLs only: Url::parse fails on relative hrefs,
    // so filter_map silently drops them.
    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
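The commented-out tokio::sync::mpsc import above hints at a channel-based variant. For comparison, here is a minimal sketch of that shape (not part of this commit): the shared Mutex<Vec<Url>> becomes a channel feeding the workers. run_workers, the buffer size 32, and the seed URLs are all illustrative, and tokio's "sync" feature would have to be added in Cargo.toml for tokio::sync to be available.

use std::sync::Arc;
use tokio::sync::{mpsc, Mutex};
use url::Url;

// Hypothetical sketch: feed workers over an mpsc channel instead of
// popping from a shared Mutex<Vec<Url>>.
async fn run_workers(links: Vec<Url>, worker_count: usize) {
    let (tx, rx) = mpsc::channel::<Url>(32);
    // tokio's mpsc has a single consumer, so the Receiver itself is
    // shared behind an async Mutex; receiving is serialized, but the
    // work after each recv still runs concurrently.
    let rx = Arc::new(Mutex::new(rx));

    let handles: Vec<_> = (0..worker_count)
        .map(|_| {
            let rx = rx.clone();
            tokio::spawn(async move {
                loop {
                    // Lock only long enough to receive one URL.
                    let url = {
                        let mut rx = rx.lock().await;
                        rx.recv().await
                    };
                    // recv() returns None once tx is dropped below.
                    let Some(url) = url else { break };
                    println!("worker got {url}");
                }
            })
        })
        .collect();

    for url in links {
        let _ = tx.send(url).await;
    }
    drop(tx); // close the channel so the workers can finish

    for h in handles {
        let _ = h.await;
    }
}

#[tokio::main]
async fn main() {
    let links: Vec<Url> = ["https://tmplab.org/", "https://example.com/"]
        .iter()
        .filter_map(|s| Url::parse(s).ok())
        .collect();
    run_workers(links, 4).await;
}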
examples/livecoding_cleaner.rs (new file, 88 lines)
@@ -0,0 +1,88 @@
use scraper::{Html, Selector};
use url::{Url, Host};
//use tokio::sync::mpsc::{self, Sender, Receiver};
use std::sync::{Arc, Mutex};
use tokio::task::JoinHandle;
use futures::future;

const WORKERS: usize = 8;

type SiteStat = (Url, Vec<Url>);

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let addr = std::env::args().nth(1)
        .unwrap_or_else(|| "www.tmplab.org".to_string());

    let links = get_links("https://tmplab.org").await?;
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != Some(Host::Domain(&addr)))
        .collect();

    let links = Arc::new(Mutex::new(links));

    let joins = (0..WORKERS)
        .map(|n| spawn_worker(n, &links));

    // JoinHandle is itself a Future, so join_all can await
    // every worker at once.
    let results = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok())
        .flatten();

    for (url, links) in results {
        println!("{url} : {} links", links.len())
    }

    Ok(())
}

// Interestingly, this function must not be async: as a plain fn it
// calls tokio::spawn right away, whereas an async fn would not spawn
// anything until its returned future was awaited.
fn spawn_worker(
    n: usize,
    links: &Arc<Mutex<Vec<Url>>>
) -> JoinHandle<Vec<SiteStat>> {
    println!("Spawning worker {n}");
    let links = links.clone();
    tokio::spawn(async move { looper(links).await })
}

async fn looper(links: Arc<Mutex<Vec<Url>>>) -> Vec<SiteStat> {
    let mut results = vec![];

    loop {
        // Pop inside a block so the MutexGuard is dropped
        // before the .await below.
        let url = {
            let mut v = links.lock().unwrap();
            let Some(url) = v.pop() else {
                break;
            };
            url
        };
        println!("Start fetching {url}...");
        let res = match get_links(url.as_ref()).await {
            Err(_) => "nope",
            Ok(links) => {
                results.push((url.clone(), links));
                "YEA!"
            }
        };
        println!("Got {url} => {res}");
    }
    results
}

async fn get_links(
    url: &str
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();

    let body = reqwest::get(url)
        .await?
        .text()
        .await?;

    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
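An aside rather than part of the commit: the fixed pool of WORKERS tasks could also be expressed with buffer_unordered from the futures crate (already a dependency here), which caps in-flight fetches without any shared Mutex. A rough sketch reusing this file's get_links, SiteStat, and WORKERS; crawl_all is an illustrative name:

use futures::stream::{self, StreamExt};
use url::Url;

// Hypothetical alternative: bounded concurrency via a stream.
// Assumes get_links, SiteStat, and WORKERS as defined above.
async fn crawl_all(links: Vec<Url>) -> Vec<SiteStat> {
    stream::iter(links)
        .map(|url| async move {
            let found = get_links(url.as_ref()).await.unwrap_or_default();
            (url, found)
        })
        // At most WORKERS fetches run at once, completing in any order.
        .buffer_unordered(WORKERS)
        .collect()
        .await
}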
examples/livecoding_simple.rs (new file, 56 lines)
@@ -0,0 +1,56 @@
use scraper::{Html, Selector};
use url::{Url, Host};
use futures::future;

type SiteStat = (Url, Vec<Url>);

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let addr = std::env::args().nth(1)
        .unwrap_or_else(|| "www.tmplab.org".to_string());

    let links = get_links("https://tmplab.org").await?;
    let links: Vec<Url> = links.into_iter()
        .filter(|url| url.host() != Some(Host::Domain(&addr)))
        .collect();

    // One task per link: no worker pool, no shared state.
    let joins = links.into_iter()
        .map(|url| {
            tokio::spawn(async move { fetch_site(url).await })
        });

    let results = future::join_all(joins).await
        .into_iter()
        .filter_map(|r| r.ok());

    for (url, links) in results {
        println!("{url} : {} links", links.len())
    }

    Ok(())
}

async fn fetch_site(url: Url) -> SiteStat {
    println!("Start fetching {url}");
    let links = get_links(url.as_ref()).await
        .unwrap_or_else(|_| vec![]);
    println!("Got {url} => {}", links.len());
    (url, links)
}

async fn get_links(
    url: &str
) -> Result<Vec<Url>, reqwest::Error> {
    let a_selector = Selector::parse("a[href]").unwrap();

    let body = reqwest::get(url)
        .await?
        .text()
        .await?;

    Ok(Html::parse_document(&body)
        .select(&a_selector)
        .filter_map(|link| link.value().attr("href")
            .and_then(|href| Url::parse(href).ok()))
        .collect())
}
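One observation, not a change: this variant spawns a task per link, so the number of concurrent fetches is unbounded. If a cap were wanted, tokio::sync::Semaphore is one common approach. The sketch below reuses this file's fetch_site and SiteStat; fetch_all_bounded and the limit of 8 are illustrative, and tokio's "sync" feature would need to be enabled in Cargo.toml.

use std::sync::Arc;
use tokio::sync::Semaphore;
use url::Url;

// Hypothetical sketch: same fan-out, but at most 8 fetches at a time.
// Assumes fetch_site and SiteStat as defined above.
async fn fetch_all_bounded(links: Vec<Url>) -> Vec<SiteStat> {
    let permits = Arc::new(Semaphore::new(8));
    let handles: Vec<_> = links.into_iter()
        .map(|url| {
            let permits = permits.clone();
            tokio::spawn(async move {
                // Each task parks here until a permit frees up.
                let _permit = permits.acquire_owned().await.unwrap();
                fetch_site(url).await
            })
        })
        .collect();

    let mut results = vec![];
    for h in handles {
        if let Ok(stat) = h.await {
            results.push(stat);
        }
    }
    results
}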
src/main.rs (new file, 13 lines)
@@ -0,0 +1,13 @@
fn main() {
    println!(r#"
This is not the code you're looking for.

Look into examples/

Try:

> cargo run --release --example livecoding
> cargo run --release --example livecoding_cleaner
> cargo run --release --example livecoding_simple
"#);
}