From a807002ddff6398321c5bc6ee576f10791cc2bab Mon Sep 17 00:00:00 2001 From: Butter Cat Date: Wed, 25 Sep 2024 13:36:23 -0400 Subject: [PATCH] Fix #206 and make (most) emotes embed in comments (#209) * Fix links not being converted when multiple emojis are in one comment * Make (most) emotes embed within comments * Restore the behavior that the "rewrite_urls_removes_backslashes_and_rewrites_url" test looks for * Listen to cargo fmt and cargo clippy's suggestions as well as removing some leftover comments and code --------- Co-authored-by: Matthew Esposito --- Cargo.lock | 60 +++++++++++++++++++++++++++++ Cargo.toml | 1 + src/main.rs | 3 ++ src/post.rs | 4 +- src/utils.rs | 105 ++++++++++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 165 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index df122ef..5313874 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -888,6 +888,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "inventory" +version = "0.3.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f958d3d68f4167080a18141e10381e7634563984a537f2a49a30fd8e53ac5767" + [[package]] name = "is-terminal" version = "0.4.12" @@ -1255,6 +1261,7 @@ dependencies = [ "sealed_test", "serde", "serde_json", + "serde_json_path", "serde_yaml", "time", "tokio", @@ -1565,6 +1572,59 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_json_path" +version = "0.6.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bc0207b6351893eafa1e39aa9aea452abb6425ca7b02dd64faf29109e7a33ba" +dependencies = [ + "inventory", + "nom", + "once_cell", + "regex", + "serde", + "serde_json", + "serde_json_path_core", + "serde_json_path_macros", + "thiserror", +] + +[[package]] +name = "serde_json_path_core" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3d64fe53ce1aaa31bea2b2b46d3b6ab6a37e61854bedcbd9f174e188f3f7d79" +dependencies = [ + "inventory", + "once_cell", 
+ "serde", + "serde_json", + "thiserror", +] + +[[package]] +name = "serde_json_path_macros" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a31e8177a443fd3e94917f12946ae7891dfb656e6d4c5e79b8c5d202fbcb723" +dependencies = [ + "inventory", + "once_cell", + "serde_json_path_core", + "serde_json_path_macros_internal", +] + +[[package]] +name = "serde_json_path_macros_internal" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75dde5a1d2ed78dfc411fc45592f72d3694436524d3353683ecb3d22009731dc" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.68", +] + [[package]] name = "serde_spanned" version = "0.6.6" diff --git a/Cargo.toml b/Cargo.toml index c6f8ac7..c29ce1f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ pretty_env_logger = "0.5.0" dotenvy = "0.15.7" rss = "2.0.7" arc-swap = "1.7.1" +serde_json_path = "0.6.7" async-recursion = "1.1.1" diff --git a/src/main.rs b/src/main.rs index 515a2a8..61f810e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -239,6 +239,9 @@ async fn main() { app.at("/img/*path").get(|r| proxy(r, "https://i.redd.it/{path}").boxed()); app.at("/thumb/:point/:id").get(|r| proxy(r, "https://{point}.thumbs.redditmedia.com/{id}").boxed()); app.at("/emoji/:id/:name").get(|r| proxy(r, "https://emoji.redditmedia.com/{id}/{name}").boxed()); + app + .at("/emote/:subreddit_id/:filename") + .get(|r| proxy(r, "https://reddit-econ-prod-assets-permanent.s3.amazonaws.com/asset-manager/{subreddit_id}/{filename}").boxed()); app .at("/preview/:loc/award_images/:fullname/:id") .get(|r| proxy(r, "https://{loc}view.redd.it/award_images/{fullname}/{id}").boxed()); diff --git a/src/post.rs b/src/post.rs index e9aa820..2642d24 100644 --- a/src/post.rs +++ b/src/post.rs @@ -4,7 +4,7 @@ use crate::config::get_setting; use crate::server::RequestExt; use crate::subreddit::{can_access_quarantine, quarantine}; use crate::utils::{ - error, format_num, get_filters, 
nsfw_landing, param, parse_post, rewrite_urls, setting, template, time, val, Author, Awards, Comment, Flair, FlairPart, Post, Preferences, + error, format_num, get_filters, nsfw_landing, param, parse_post, rewrite_emotes, setting, template, time, val, Author, Awards, Comment, Flair, FlairPart, Post, Preferences, }; use hyper::{Body, Request, Response}; @@ -178,7 +178,7 @@ fn build_comment( get_setting("REDLIB_PUSHSHIFT_FRONTEND").unwrap_or_else(|| String::from(crate::config::DEFAULT_PUSHSHIFT_FRONTEND)), ) } else { - rewrite_urls(&val(comment, "body_html")) + rewrite_emotes(&data["media_metadata"], val(comment, "body_html")) }; let kind = comment["kind"].as_str().unwrap_or_default().to_string(); diff --git a/src/utils.rs b/src/utils.rs index ff968f8..987565f 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -12,6 +12,7 @@ use once_cell::sync::Lazy; use regex::Regex; use rust_embed::RustEmbed; use serde_json::Value; +use serde_json_path::{JsonPath, JsonPathExt}; use std::collections::{HashMap, HashSet}; use std::env; use std::str::FromStr; @@ -919,12 +920,19 @@ pub fn rewrite_urls(input_text: &str) -> String { // Rewrite Reddit links to Redlib REDDIT_REGEX.replace_all(input_text, r#"href="/"#) .to_string(); - text1 = REDDIT_EMOJI_REGEX - .replace_all(&text1, format_url(REDDIT_EMOJI_REGEX.find(&text1).map(|x| x.as_str()).unwrap_or_default())) - .to_string() - // Remove (html-encoded) "\" from URLs. - .replace("%5C", "") - .replace("\\_", "_"); + + loop { + if REDDIT_EMOJI_REGEX.find(&text1).is_none() { + break; + } else { + text1 = REDDIT_EMOJI_REGEX + .replace_all(&text1, format_url(REDDIT_EMOJI_REGEX.find(&text1).map(|x| x.as_str()).unwrap_or_default())) + .to_string() + } + } + + // Remove (html-encoded) "\" from URLs. 
+ text1 = text1.replace("%5C", "").replace("\\_", "_"); // Rewrite external media previews to Redlib loop { @@ -980,6 +988,83 @@ pub fn rewrite_urls(input_text: &str) -> String { } } +// These links all follow a pattern of "https://reddit-econ-prod-assets-permanent.s3.amazonaws.com/asset-manager/SUBREDDIT_ID/RANDOM_FILENAME.png" +static REDDIT_EMOTE_LINK_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#"https://reddit-econ-prod-assets-permanent.s3.amazonaws.com/asset-manager/(.*)"#).unwrap()); + +// These all follow a pattern of '"emote|SUBREDDIT_ID|NUMBER"', we want the number +static REDDIT_EMOTE_ID_NUMBER_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(r#""emote\|.*\|(.*)""#).unwrap()); + +pub fn rewrite_emotes(media_metadata: &Value, comment: String) -> String { + /* Create the paths we'll use to look for our data inside the json. + Because we don't know the name of any given emote we use a wildcard to parse them. */ + let link_path = JsonPath::parse("$[*].s.u").expect("valid JSON Path"); + let id_path = JsonPath::parse("$[*].id").expect("valid JSON Path"); + let size_path = JsonPath::parse("$[*].s.y").expect("valid JSON Path"); + + // Extract all of the results from those json paths + let link_nodes = media_metadata.json_path(&link_path); + let id_nodes = media_metadata.json_path(&id_path); + + // Initialize our vectors + let mut id_vec = Vec::new(); + let mut link_vec = Vec::new(); + + // Add the relevant data to each of our vectors so we can access it by number later + for current_id in id_nodes { + id_vec.push(current_id) + } + for current_link in link_nodes { + link_vec.push(current_link) + } + + /* Set index to the length of link_vec. + This is one larger than we'll actually be looking at, but we correct that later */ + let mut index = link_vec.len(); + + // Comment needs to be in scope for when we call rewrite_urls() + let mut comment = comment; + + /* Loop until index hits zero.
+ This also prevents us from trying to do anything on an empty vector */ + while index != 0 { + /* Subtract 1 from index to get the real index we should be looking at. + Then continue on each subsequent loop to continue until we hit the last entry in the vector. + This is how we get this to deal with multiple emotes in a single message and properly replace each ID with its link */ + index -= 1; + + // Convert our current index in id_vec into a string so we can search through it with regex + let current_id = id_vec[index].to_string(); + + /* The ID number can be multiple lengths, so we capture it with regex. + We also want to only attempt anything when we get matches to avoid panicking */ + if let Some(id_capture) = REDDIT_EMOTE_ID_NUMBER_REGEX.captures(&current_id) { + // Format the ID to include the colons it has in the comment text + let id = format!(":{}:", &id_capture[1]); + + // Convert current link to string to search through it with the regex + let link = link_vec[index].to_string(); + + // Make sure we only do operations when we get matches, otherwise we panic when trying to access the first match + if let Some(link_capture) = REDDIT_EMOTE_LINK_REGEX.captures(&link) { + /* Reddit sends a size for the image based on whether it's alone or accompanied by text. + It's a good idea and makes everything look nicer, so we'll do the same. */ + let size = media_metadata.json_path(&size_path).first().unwrap().to_string(); + + // Replace the ID we found earlier in the comment with the respective image and its link from the regex capture + let to_replace_with = format!( + "", + &link_capture[1] + ); + + // Inside the comment replace the ID we found with the string that will embed the image + comment = comment.replace(&id, &to_replace_with).to_string(); + } + } + } + // Call rewrite_urls() to transform any other Reddit links + rewrite_urls(&comment) +} + // Format vote count to a string that will be displayed.
// Append `m` and `k` for millions and thousands respectively, and // round to the nearest tenth. @@ -1301,3 +1386,11 @@ fn test_url_path_basename() { // empty path assert_eq!(url_path_basename("/"), ""); } + +#[test] +fn test_rewriting_emotes() { + let json_input = serde_json::from_str(r#"{"emote|t5_31hpy|2028":{"e":"Image","id":"emote|t5_31hpy|2028","m":"image/png","s":{"u":"https://reddit-econ-prod-assets-permanent.s3.amazonaws.com/asset-manager/t5_31hpy/PW6WsOaLcd.png","x":60,"y":60},"status":"valid","t":"sticker"}}"#).expect("Valid JSON"); + let comment_input = r#"

:2028:

"#; + let output = r#"

"#; + assert_eq!(rewrite_emotes(&json_input, comment_input.to_string()), output); +}