From 7209e75c6bdaddb5c4cb033da3305d864790e62b Mon Sep 17 00:00:00 2001
From: starlight
Date: Thu, 23 Jan 2025 10:21:05 +1300
Subject: [PATCH] fix annoying edge cases in url/gallery/crosspost handling and reduce json file size a bit

---
 .gitignore    |  4 ++--
 README.md     |  6 +++++-
 src/index.js  |  2 +-
 src/parser.js | 18 ++++++++++++++--
 src/sql.js    | 60 +++++++++++++++++++++++++++++++++++++++++++++------
 5 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3d4e9ab..7324288 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 dummy/
 node_modules/
-*.sql
+*.sql*
 ok
 src/test.js
-*-threads.json
\ No newline at end of file
+*-threads.json*
\ No newline at end of file
diff --git a/README.md b/README.md
index da55cde..2f316a9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 # reddit-lemmy-importer
 turn json files downloaded from https://the-eye.eu/redarcs/ into lemmy comms :D
 
+*Note: these archives only go up to 12/31/2022*
+
 this is effectively https://github.com/mesmere/RedditLemmyImporter but in js and for a different type of archive
 
 the posts/comments dump is read as a stream so handling bigger subreddits is less ram-intensive (though the final tree will still take up a good amount of ram so maybe create a big swapfile if processing large subreddits)
@@ -9,6 +11,8 @@ the posts/comments dump is read as a stream so handling bigger subreddits is les
 
 You can build the SQL script before making the comm/user though.
 
+
+
 ## usage:
 
 install dependencies
@@ -33,7 +37,7 @@ option 3: import your sql to a docker container
 
 ## TODO:
 - set URL embed titles/descriptions and url_content type and embed_video_url in posts
-- FIX ap_id!!!!!
+- FIX ap_id!!!!! (may not be needed, see the post_updates_2020_04_03 and comment_updates_2020_04_03 functions in lemmy/src/code_migrations.rs)
 - 
 - this could be done by taking the federated url as an argument then updating the ap_id using [the url + /type/ + sql id from the post]
 
diff --git a/src/index.js b/src/index.js
index 18fa691..1d01a67 100644
--- a/src/index.js
+++ b/src/index.js
@@ -91,7 +91,7 @@ async function unflatten(postsFile, commentsFile) {
             //console.log(threadCounter)
             //threadCounter == 467 ? console.log( `${threadCounter} ${thread}`) : null
         } catch (error) {
-            console.error('Error processing post:', error);
+            console.error(`Error processing post ${thread.value.name}:`, error);
         }
     });
 
diff --git a/src/parser.js b/src/parser.js
index 9f46551..fef46b4 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -58,8 +58,9 @@ async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
 
     // allowed fields
-    // reduced the size of my dummy json to about 31% of its size without the filters
-    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
+    // reduced the size of my dummy json to about 22% of its original size without the filters
+    // also makes writing sql a couple seconds faster since it's reading less bullshit from the disk
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "crosspost_parent_list", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
     const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];
 
     // Process posts first
@@ -78,6 +79,19 @@
             }
 
             post = filterJsonKeys(post, submissionKeysAllowed);
+
+            // clear unused fields + large array of unused thumbnails (best quality is used instead)
+            if(post?.gallery_data?.items) { post.gallery_data.items.forEach(item => {
+                item?.id ? delete item.id : null;
+                if(post?.media_metadata?.[item?.media_id]) {
+                    delete post.media_metadata[item.media_id].m;
+                    delete post.media_metadata[item.media_id].p;
+                }
+            })}
+
+            // reduce crosspost size too (filter each crosspost parent, not the outer post)
+            post.crosspost_parent_list ? post.crosspost_parent_list.forEach((crosspost, i, arr) => arr[i] = filterJsonKeys(crosspost, submissionKeysAllowed)) : null;
+
             context.processItem(post);
             //dbgpost++;
         }
diff --git a/src/sql.js b/src/sql.js
index e70aa65..eaf06f9 100644
--- a/src/sql.js
+++ b/src/sql.js
@@ -55,8 +55,31 @@ function mkTitle(post) {
 
 // wrap the url in singlequotes HERE and not in the query like '${lit(mkUrl(post))}'
 // this is because the null type in postgres will be turned into a string which will break lemmy until you remove the row or set it to null manually
+
+// the 1-line ternary gets obliterated thanks to 15,000,000 different edge cases
 function mkUrl(post) {
-    return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : `'${post.url}'`
+    if(post.is_gallery){
+        // deleted galleries have "url": null, not the text 'null'
+        // [0][0] means the first image's url
+        return post.url == null ? 'null' : `'${getSubmissionImages(post)[0][0]}'`;
+    } else if(post.is_self) {
+        return 'null'
+    } else if(post.crosspost_parent_list) {
+        // crosspost urls are just paths "/r/subreddit/id/...", so get the full url from the original post
+        // deleted crossposts have an empty array
+        if(post.crosspost_parent_list.length > 0) {
+            if(post.crosspost_parent_list[0].is_gallery){
+                return `'${getSubmissionImages(post.crosspost_parent_list[0])[0][0]}'`;
+            } else {
+                return `'${post.crosspost_parent_list[0].url}'`
+            }
+        } else {
+            return 'null'
+        }
+    } else {
+        return post.url == null || post.url == '' ? 'null' : `'${post.url}'`;
+    }
+    //return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : post.crosspost_parent_list ? `'${post.crosspost_parent_list[0].url}'` : `'${post.url}'`
 }
 
 /**
@@ -86,9 +109,24 @@ function mkBody(post, postType) {
     // double newlines for reddit spacing is done at the START of the next addition, this way there arent trailing newlines if theres nothing else after.
     outputBody += [...post.title].length > 200 ? `\n\n\`Full title: ${post.title}\`` : ''
 
+    // i want to scream: crossposts embed the original post's body instead of a url
+    if (post.crosspost_parent_list && post.crosspost_parent_list.length > 0) {
+        var crosspost = post.crosspost_parent_list[0];
+        outputBody += `\n\nCrosspost:`
+
+        outputBody += `\n\n${mkBody(crosspost, "post")}`
+        return outputBody;
+    }
+
     var postImages = getSubmissionImages(post);
     // add "Gallery links:" then all the image links as bullet points if the post is a gallery
-    outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => `\n\n- ${image}`).join('')}` : ''
+    outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => {
+        if(image.length > 1) {
+            // length > 1 means the image has a caption
+            return `\n\n- ${image[0]} - "${image[1]}"`
+        }
+        return `\n\n- ${image[0]}`
+    }).join('')}` : ''
 
     // only if selftext exists, it wont exist if its an image post
     outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
@@ -120,16 +158,27 @@
             // so we get s.u of the current key and push it to the array
             // if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
             var file = post.media_metadata[fileIDs.media_id];
+            var item = [];
             if (file.status == "valid") {
                 if (file.e == "Image"){
-                    gallery_collection.push(decodeHTML(file.s.u))
+                    item.push(decodeHTML(file.s.u))
                 } else if (file.e == "AnimatedImage") {
-                    gallery_collection.push(decodeHTML(file.s.gif))
+                    item.push(decodeHTML(file.s.gif))
                 } else {
                     console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
                 }
+                // if there was a known image type
+                if (item.length > 0) {
+                    // add the caption too if it exists
+                    if(fileIDs.caption) {
+                        item.push(fileIDs.caption)
+                    }
+
+                    gallery_collection.push(item);
+                }
+
             } /*else if (file.status == "failed") {
 
             }*/
@@ -141,7 +190,6 @@
     }
 }
 
-
 // Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
 // where comment and reply are ids in the comment table
 
@@ -267,4 +315,4 @@ function writeSql(post, targetCommName, targetUserName) {
     return `DO $$ ${query} $$;\n\n`
 }
 
-export { writeSql }
\ No newline at end of file
+export { writeSql, mkUrl }
\ No newline at end of file
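
A note on the new `mkUrl` export: a minimal sketch of how the edge cases above could be exercised, assuming the module is imported straight from `src/sql.js` as laid out in the diff; the script itself and the sample objects are hypothetical, not part of the patch.

```js
// quick manual check of mkUrl's edge cases (hypothetical script, not part of the patch)
import { mkUrl } from './src/sql.js';

// deleted gallery: the dump has "url": null, so the SQL value should be the bare keyword null
console.log(mkUrl({ is_gallery: true, url: null }));           // null
// self post: no link to store
console.log(mkUrl({ is_self: true }));                         // null
// deleted crosspost: crosspost_parent_list is present but empty
console.log(mkUrl({ crosspost_parent_list: [] }));             // null
// ordinary link post: wrapped in single quotes for the generated INSERT
console.log(mkUrl({ url: 'https://example.com/image.png' }));  // 'https://example.com/image.png'
```

Exporting `mkUrl` alongside `writeSql` makes this kind of spot check possible without generating a whole SQL script.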
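For the ap_id TODO, the README already describes the approach (take the federated url as an argument, then set ap_id to [the url + /type/ + sql id from the post]). In the same string-building style sql.js uses, that could look roughly like the sketch below; the helper name, the join on `community.name`, and the `LIKE` filter are assumptions about lemmy's schema, not code from this patch.

```js
// hypothetical helper sketching the README's ap_id idea; not part of this patch
function mkApIdFixup(federatedUrl, targetCommName) {
    // rewrite post.ap_id as federatedUrl + /post/ + the row's sql id, limited to the target community
    return `UPDATE post SET ap_id = '${federatedUrl}/post/' || post.id ` +
        `FROM community WHERE post.community_id = community.id AND community.name = '${targetCommName}';\n` +
        // comments hang off the posts rewritten above, so reuse the new post ap_ids as the filter
        `UPDATE comment SET ap_id = '${federatedUrl}/comment/' || comment.id ` +
        `FROM post WHERE comment.post_id = post.id AND post.ap_id LIKE '${federatedUrl}/post/%';\n`;
}
```

As the updated TODO notes, this may be unnecessary: lemmy's post_updates_2020_04_03 and comment_updates_2020_04_03 migrations appear to rebuild placeholder ap_ids on startup anyway.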