feat(scraper): add scraper CLI
This commit is contained in:
parent
49ef59e000
commit
f3d2f0cc59
19
Cargo.lock
generated
19
Cargo.lock
generated
@ -268,6 +268,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|||||||
checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
|
checksum = "5db83dced34638ad474f39f250d7fea9598bdd239eaced1bdf45d597da0f433f"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"clap_builder",
|
"clap_builder",
|
||||||
|
"clap_derive",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -280,6 +281,18 @@ dependencies = [
|
|||||||
"clap_lex",
|
"clap_lex",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "clap_derive"
|
||||||
|
version = "4.5.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "c780290ccf4fb26629baa7a1081e68ced113f1d3ec302fa5948f1c381ebf06c6"
|
||||||
|
dependencies = [
|
||||||
|
"heck",
|
||||||
|
"proc-macro2",
|
||||||
|
"quote",
|
||||||
|
"syn 2.0.68",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "clap_lex"
|
name = "clap_lex"
|
||||||
version = "0.7.1"
|
version = "0.7.1"
|
||||||
@ -735,6 +748,12 @@ version = "0.15.0"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
|
checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "heck"
|
||||||
|
version = "0.5.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "hermit-abi"
|
name = "hermit-abi"
|
||||||
version = "0.3.9"
|
version = "0.3.9"
|
||||||
|
10
Cargo.toml
10
Cargo.toml
@ -9,6 +9,7 @@ authors = [
|
|||||||
"spikecodes <19519553+spikecodes@users.noreply.github.com>",
|
"spikecodes <19519553+spikecodes@users.noreply.github.com>",
|
||||||
]
|
]
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
|
default-run = "redlib"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
rinja = { version = "0.3.4", default-features = false }
|
rinja = { version = "0.3.4", default-features = false }
|
||||||
@ -16,6 +17,7 @@ cached = { version = "0.51.3", features = ["async"] }
|
|||||||
clap = { version = "4.4.11", default-features = false, features = [
|
clap = { version = "4.4.11", default-features = false, features = [
|
||||||
"std",
|
"std",
|
||||||
"env",
|
"env",
|
||||||
|
"derive",
|
||||||
] }
|
] }
|
||||||
regex = "1.10.2"
|
regex = "1.10.2"
|
||||||
serde = { version = "1.0.193", features = ["derive"] }
|
serde = { version = "1.0.193", features = ["derive"] }
|
||||||
@ -56,3 +58,11 @@ sealed_test = "1.0.0"
|
|||||||
codegen-units = 1
|
codegen-units = 1
|
||||||
lto = true
|
lto = true
|
||||||
strip = "symbols"
|
strip = "symbols"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "redlib"
|
||||||
|
path = "src/main.rs"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "scraper"
|
||||||
|
path = "src/scraper/main.rs"
|
73
src/scraper/main.rs
Normal file
73
src/scraper/main.rs
Normal file
@ -0,0 +1,73 @@
|
|||||||
|
use std::{fmt::Display, io::Write};
|
||||||
|
|
||||||
|
use clap::{Parser, ValueEnum};
|
||||||
|
use redlib::utils::Post;
|
||||||
|
|
||||||
|
#[derive(Parser)]
|
||||||
|
#[command(name = "my_cli")]
|
||||||
|
#[command(about = "A simple CLI example", long_about = None)]
|
||||||
|
struct Cli {
|
||||||
|
#[arg(short = 's', long = "sub")]
|
||||||
|
sub: String,
|
||||||
|
|
||||||
|
#[arg(short = 'c', long = "count")]
|
||||||
|
count: usize,
|
||||||
|
|
||||||
|
#[arg(long = "sort")]
|
||||||
|
sort: SortOrder,
|
||||||
|
|
||||||
|
#[arg(short = 'f', long = "format", value_enum)]
|
||||||
|
format: Format,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, ValueEnum)]
|
||||||
|
enum SortOrder {
|
||||||
|
Hot,
|
||||||
|
Rising,
|
||||||
|
New,
|
||||||
|
Top,
|
||||||
|
Controversial,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Display for SortOrder {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
match self {
|
||||||
|
SortOrder::Hot => write!(f, "hot"),
|
||||||
|
SortOrder::Rising => write!(f, "rising"),
|
||||||
|
SortOrder::New => write!(f, "new"),
|
||||||
|
SortOrder::Top => write!(f, "top"),
|
||||||
|
SortOrder::Controversial => write!(f, "controversial"),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, ValueEnum)]
|
||||||
|
enum Format {
|
||||||
|
Json,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::main]
|
||||||
|
async fn main() {
|
||||||
|
let cli = Cli::parse();
|
||||||
|
let (sub, final_count, sort, format) = (cli.sub, cli.count, cli.sort, cli.format);
|
||||||
|
let initial = format!("/r/{sub}/{sort}.json?&raw_json=1");
|
||||||
|
let (mut posts, mut after) = Post::fetch(&initial, false).await.unwrap();
|
||||||
|
while posts.len() < final_count {
|
||||||
|
print!("\r");
|
||||||
|
let path = format!("/r/{sub}/{sort}.json?sort={sort}&t=&after={after}&raw_json=1");
|
||||||
|
let (new_posts, new_after) = Post::fetch(&path, false).await.unwrap();
|
||||||
|
posts.extend(new_posts);
|
||||||
|
after = new_after;
|
||||||
|
// Print number of posts fetched
|
||||||
|
print!("Fetched {} posts", posts.len());
|
||||||
|
std::io::stdout().flush().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
match format {
|
||||||
|
Format::Json => {
|
||||||
|
let filename: String = format!("{sub}.json");
|
||||||
|
let json = serde_json::to_string(&posts).unwrap();
|
||||||
|
std::fs::write(filename, json).unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
15
src/utils.rs
15
src/utils.rs
@ -11,6 +11,7 @@ use once_cell::sync::Lazy;
|
|||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use rinja::Template;
|
use rinja::Template;
|
||||||
use rust_embed::RustEmbed;
|
use rust_embed::RustEmbed;
|
||||||
|
use serde::Serialize;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use serde_json_path::{JsonPath, JsonPathExt};
|
use serde_json_path::{JsonPath, JsonPathExt};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
@ -46,6 +47,7 @@ pub enum ResourceType {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Post flair with content, background color and foreground color
|
// Post flair with content, background color and foreground color
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct Flair {
|
pub struct Flair {
|
||||||
pub flair_parts: Vec<FlairPart>,
|
pub flair_parts: Vec<FlairPart>,
|
||||||
pub text: String,
|
pub text: String,
|
||||||
@ -54,7 +56,7 @@ pub struct Flair {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Part of flair, either emoji or text
|
// Part of flair, either emoji or text
|
||||||
#[derive(Clone)]
|
#[derive(Clone, Serialize)]
|
||||||
pub struct FlairPart {
|
pub struct FlairPart {
|
||||||
pub flair_part_type: String,
|
pub flair_part_type: String,
|
||||||
pub value: String,
|
pub value: String,
|
||||||
@ -96,12 +98,14 @@ impl FlairPart {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct Author {
|
pub struct Author {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub flair: Flair,
|
pub flair: Flair,
|
||||||
pub distinguished: String,
|
pub distinguished: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct Poll {
|
pub struct Poll {
|
||||||
pub poll_options: Vec<PollOption>,
|
pub poll_options: Vec<PollOption>,
|
||||||
pub voting_end_timestamp: (String, String),
|
pub voting_end_timestamp: (String, String),
|
||||||
@ -129,6 +133,7 @@ impl Poll {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct PollOption {
|
pub struct PollOption {
|
||||||
pub id: u64,
|
pub id: u64,
|
||||||
pub text: String,
|
pub text: String,
|
||||||
@ -158,13 +163,14 @@ impl PollOption {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Post flags with nsfw and stickied
|
// Post flags with nsfw and stickied
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct Flags {
|
pub struct Flags {
|
||||||
pub spoiler: bool,
|
pub spoiler: bool,
|
||||||
pub nsfw: bool,
|
pub nsfw: bool,
|
||||||
pub stickied: bool,
|
pub stickied: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug, Serialize)]
|
||||||
pub struct Media {
|
pub struct Media {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
pub alt_url: String,
|
pub alt_url: String,
|
||||||
@ -264,6 +270,7 @@ impl Media {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct GalleryMedia {
|
pub struct GalleryMedia {
|
||||||
pub url: String,
|
pub url: String,
|
||||||
pub width: i64,
|
pub width: i64,
|
||||||
@ -304,6 +311,7 @@ impl GalleryMedia {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Post containing content, metadata and media
|
// Post containing content, metadata and media
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct Post {
|
pub struct Post {
|
||||||
pub id: String,
|
pub id: String,
|
||||||
pub title: String,
|
pub title: String,
|
||||||
@ -470,7 +478,7 @@ pub struct Comment {
|
|||||||
pub prefs: Preferences,
|
pub prefs: Preferences,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Default, Clone)]
|
#[derive(Default, Clone, Serialize)]
|
||||||
pub struct Award {
|
pub struct Award {
|
||||||
pub name: String,
|
pub name: String,
|
||||||
pub icon_url: String,
|
pub icon_url: String,
|
||||||
@ -484,6 +492,7 @@ impl std::fmt::Display for Award {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
pub struct Awards(pub Vec<Award>);
|
pub struct Awards(pub Vec<Award>);
|
||||||
|
|
||||||
impl std::ops::Deref for Awards {
|
impl std::ops::Deref for Awards {
|
||||||
|
Loading…
Reference in New Issue
Block a user