fix(scraper): additionally grab common words
This commit is contained in:
parent
62717ef6b2
commit
6c64ebd56b
46
Cargo.lock
generated
46
Cargo.lock
generated
@ -71,6 +71,12 @@ version = "1.0.9"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56"
|
checksum = "8365de52b16c035ff4fcafe0092ba9390540e3e352870ac09933bebcaa2c8c56"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "anyhow"
|
||||||
|
version = "1.0.93"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4c95c10ba0b00a02636238b814946408b1322d5ac4760326e6fb8ec956d85775"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "arc-swap"
|
name = "arc-swap"
|
||||||
version = "1.7.1"
|
version = "1.7.1"
|
||||||
@ -307,6 +313,18 @@ version = "0.7.2"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "common-words-all"
|
||||||
|
version = "0.0.2"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "84a6ff47eb813c9e315610ceca0ddd247827e22f2cdadc4189e4676a81470c77"
|
||||||
|
dependencies = [
|
||||||
|
"anyhow",
|
||||||
|
"csv",
|
||||||
|
"glob",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "cookie"
|
name = "cookie"
|
||||||
version = "0.18.1"
|
version = "0.18.1"
|
||||||
@ -370,6 +388,27 @@ dependencies = [
|
|||||||
"typenum",
|
"typenum",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv"
|
||||||
|
version = "1.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "acdc4883a9c96732e4733212c01447ebd805833b7275a73ca3ee080fd77afdaf"
|
||||||
|
dependencies = [
|
||||||
|
"csv-core",
|
||||||
|
"itoa",
|
||||||
|
"ryu",
|
||||||
|
"serde",
|
||||||
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "csv-core"
|
||||||
|
version = "0.1.11"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "5efa2b3d7902f4b634a20cae3c9c4e6209dc4779feb6863329607560143efa70"
|
||||||
|
dependencies = [
|
||||||
|
"memchr",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "darling"
|
name = "darling"
|
||||||
version = "0.20.10"
|
version = "0.20.10"
|
||||||
@ -642,6 +681,12 @@ version = "0.31.1"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
|
checksum = "07e28edb80900c19c28f1072f2e8aeca7fa06b23cd4169cefe1af5aa3260783f"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "glob"
|
||||||
|
version = "0.3.1"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "globset"
|
name = "globset"
|
||||||
version = "0.4.15"
|
version = "0.4.15"
|
||||||
@ -1160,6 +1205,7 @@ dependencies = [
|
|||||||
"build_html",
|
"build_html",
|
||||||
"cached",
|
"cached",
|
||||||
"clap",
|
"clap",
|
||||||
|
"common-words-all",
|
||||||
"cookie",
|
"cookie",
|
||||||
"dotenvy",
|
"dotenvy",
|
||||||
"fastrand",
|
"fastrand",
|
||||||
|
@ -48,6 +48,7 @@ rss = "2.0.7"
|
|||||||
arc-swap = "1.7.1"
|
arc-swap = "1.7.1"
|
||||||
serde_json_path = "0.6.7"
|
serde_json_path = "0.6.7"
|
||||||
async-recursion = "1.1.1"
|
async-recursion = "1.1.1"
|
||||||
|
common-words-all = { version = "0.0.2", default-features = false, features = ["english", "one"] }
|
||||||
|
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
use std::{fmt::Display, io::Write};
|
use std::{collections::HashMap, fmt::Display, io::Write};
|
||||||
|
|
||||||
use clap::{Parser, ValueEnum};
|
use clap::{Parser, ValueEnum};
|
||||||
|
use common_words_all::{get_top, Language, NgramSize};
|
||||||
use redlib::utils::Post;
|
use redlib::utils::Post;
|
||||||
|
|
||||||
#[derive(Parser)]
|
#[derive(Parser)]
|
||||||
@ -10,9 +11,6 @@ struct Cli {
|
|||||||
#[arg(short = 's', long = "sub")]
|
#[arg(short = 's', long = "sub")]
|
||||||
sub: String,
|
sub: String,
|
||||||
|
|
||||||
#[arg(short = 'c', long = "count")]
|
|
||||||
count: usize,
|
|
||||||
|
|
||||||
#[arg(long = "sort")]
|
#[arg(long = "sort")]
|
||||||
sort: SortOrder,
|
sort: SortOrder,
|
||||||
|
|
||||||
@ -50,28 +48,85 @@ enum Format {
|
|||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() {
|
async fn main() {
|
||||||
|
pretty_env_logger::init();
|
||||||
let cli = Cli::parse();
|
let cli = Cli::parse();
|
||||||
let (sub, final_count, sort, format, output) = (cli.sub, cli.count, cli.sort, cli.format, cli.output);
|
let (sub, sort, format, output) = (cli.sub, cli.sort, cli.format, cli.output);
|
||||||
let initial = format!("/r/{sub}/{sort}.json?&raw_json=1");
|
let initial = format!("/r/{sub}/{sort}.json?&raw_json=1");
|
||||||
let (mut posts, mut after) = Post::fetch(&initial, false).await.unwrap();
|
let (posts, mut after) = Post::fetch(&initial, false).await.unwrap();
|
||||||
while posts.len() < final_count {
|
let mut hashmap = HashMap::new();
|
||||||
|
hashmap.extend(posts.into_iter().map(|post| (post.id.clone(), post)));
|
||||||
|
loop {
|
||||||
print!("\r");
|
print!("\r");
|
||||||
let path = format!("/r/{sub}/{sort}.json?sort={sort}&t=&after={after}&raw_json=1");
|
let path = format!("/r/{sub}/{sort}.json?sort={sort}&t=&after={after}&raw_json=1");
|
||||||
let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap();
|
let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap();
|
||||||
posts.extend(new_posts);
|
let old_len = hashmap.len();
|
||||||
after = new_after;
|
// Key the new posts by id, then merge them into the map so duplicate posts collapse.
|
||||||
// Print number of posts fetched
|
let new_posts = new_posts.into_iter().map(|post| (post.id.clone(), post)).collect::<HashMap<String, Post>>();
|
||||||
print!("Fetched {} posts", posts.len());
|
let len = new_posts.len();
|
||||||
std::io::stdout().flush().unwrap();
|
hashmap.extend(new_posts);
|
||||||
|
if hashmap.len() - old_len < 3 {
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
posts.truncate(final_count);
|
let x = hashmap.len() - old_len;
|
||||||
|
after = new_after;
|
||||||
|
// Print number of posts fetched
|
||||||
|
print!("Fetched {len} posts (+{x})",);
|
||||||
|
std::io::stdout().flush().unwrap();
|
||||||
|
}
|
||||||
|
println!("\n\n");
|
||||||
|
// Additionally search the subreddit for each of the 10,000 most common English words; this runs unconditionally (there is no final count).
|
||||||
|
|
||||||
|
for word in get_top(Language::English, 10_000, NgramSize::One) {
|
||||||
|
let mut retrieved_posts_from_search = 0;
|
||||||
|
let initial = format!("/r/{sub}/search.json?q={word}&restrict_sr=on&include_over_18=on&raw_json=1&sort={sort}");
|
||||||
|
println!("Grabbing posts with word {word}.");
|
||||||
|
let (posts, mut after) = Post::fetch(&initial, false).await.unwrap();
|
||||||
|
hashmap.extend(posts.into_iter().map(|post| (post.id.clone(), post)));
|
||||||
|
'search: loop {
|
||||||
|
let path = format!("/r/{sub}/search.json?q={word}&restrict_sr=on&include_over_18=on&raw_json=1&sort={sort}&after={after}");
|
||||||
|
let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap();
|
||||||
|
if new_posts.is_empty() || new_after.is_empty() {
|
||||||
|
println!("No more posts for word {word}");
|
||||||
|
break 'search;
|
||||||
|
}
|
||||||
|
retrieved_posts_from_search += new_posts.len();
|
||||||
|
let old_len = hashmap.len();
|
||||||
|
let new_posts = new_posts.into_iter().map(|post| (post.id.clone(), post)).collect::<HashMap<String, Post>>();
|
||||||
|
let len = new_posts.len();
|
||||||
|
hashmap.extend(new_posts);
|
||||||
|
let delta = hashmap.len() - old_len;
|
||||||
|
after = new_after;
|
||||||
|
// Print number of posts fetched
|
||||||
|
println!("Fetched {len} posts (+{delta})",);
|
||||||
|
|
||||||
|
if retrieved_posts_from_search > 1000 {
|
||||||
|
println!("Reached 1000 posts from search");
|
||||||
|
break 'search;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
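// Save incrementally after each word: write to a temp file, then rename it over the final file. NOTE: if `output` is set, both paths are the same file, so the write is not atomic.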
|
||||||
|
let tmp_file = output.clone().unwrap_or_else(|| format!("{sub}.json.tmp"));
|
||||||
|
let perm_file = output.clone().unwrap_or_else(|| format!("{sub}.json"));
|
||||||
|
write_posts(&hashmap.values().collect(), tmp_file.clone());
|
||||||
|
// move file
|
||||||
|
std::fs::rename(tmp_file, perm_file).unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
println!("\n\n");
|
||||||
|
|
||||||
|
println!("Size of hashmap: {}", hashmap.len());
|
||||||
|
|
||||||
|
let posts: Vec<&Post> = hashmap.values().collect();
|
||||||
match format {
|
match format {
|
||||||
Format::Json => {
|
Format::Json => {
|
||||||
let filename: String = output.unwrap_or_else(|| format!("{sub}.json"));
|
let filename: String = output.unwrap_or_else(|| format!("{sub}.json"));
|
||||||
|
write_posts(&posts, filename);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn write_posts(posts: &Vec<&Post>, filename: String) {
|
||||||
let json = serde_json::to_string(&posts).unwrap();
|
let json = serde_json::to_string(&posts).unwrap();
|
||||||
std::fs::write(filename, json).unwrap();
|
std::fs::write(filename, json).unwrap();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
|
Loading…
Reference in New Issue
Block a user