From 7209e75c6bdaddb5c4cb033da3305d864790e62b Mon Sep 17 00:00:00 2001
From: starlight
Date: Thu, 23 Jan 2025 10:21:05 +1300
Subject: [PATCH] fix annoying edge cases in url/gallery/crosspost handling and reduce json file size a bit

---
 .gitignore    |  4 ++--
 README.md     |  6 +++++-
 src/index.js  |  2 +-
 src/parser.js | 18 ++++++++++++++--
 src/sql.js    | 60 +++++++++++++++++++++++++++++++++++++++++++++------
 5 files changed, 78 insertions(+), 12 deletions(-)

diff --git a/.gitignore b/.gitignore
index 3d4e9ab..7324288 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,6 @@
 dummy/
 node_modules/
-*.sql
+*.sql*
 ok
 src/test.js
-*-threads.json
\ No newline at end of file
+*-threads.json*
\ No newline at end of file
diff --git a/README.md b/README.md
index da55cde..2f316a9 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,8 @@
 # reddit-lemmy-importer
 turn json files downloaded from https://the-eye.eu/redarcs/ into lemmy comms :D
 
+*Note: these archives only go up to 12/31/2022*
+
 this is effectively https://github.com/mesmere/RedditLemmyImporter but in js and for a different type of archive
 
 the posts/comments dump is read as a stream so handling bigger subreddits is less ram-intensive (though the final tree will still take up a good amount of ram so maybe create a big swapfile if processing large subreddits)
@@ -9,6 +11,8 @@ the posts/comments dump is read as a stream so handling bigger subreddits is les
 
 You can build the SQL script before making the comm/user though.
 
+
+
 ## usage:
 
 install dependencies
@@ -33,7 +37,7 @@ option 3: import your sql to a docker container
 
 ## TODO:
 - set URL embed titles/descriptions and url_content type and embed_video_url in posts
-- FIX ap_id!!!!!
+- FIX ap_id!!!!! (may not be needed, see the post_updates_2020_04_03 and comment_updates_2020_04_03 functions in lemmy/src/code_migrations.rs)
 - 
 - this could be done by taking the federated url as an argument then updating the ap_id using [the url + /type/ + sql id from the post]
 
diff --git a/src/index.js b/src/index.js
index 18fa691..1d01a67 100644
--- a/src/index.js
+++ b/src/index.js
@@ -91,7 +91,7 @@ async function unflatten(postsFile, commentsFile) {
             //console.log(threadCounter)
             //threadCounter == 467 ? console.log( `${threadCounter} ${thread}`) : null
         } catch (error) {
-            console.error('Error processing post:', error);
+            console.error(`Error processing post ${thread.value.name}:`, error);
         }
     });
 
diff --git a/src/parser.js b/src/parser.js
index 9f46551..fef46b4 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -58,8 +58,9 @@ async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
 
     // allowed fields
-    // reduced the size of my dummy json to about 31% of its size without the filters
-    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
+    // reduced the size of my dummy json to about 22% of its original size without the filters
+    // also makes writing sql a couple seconds faster since it's reading less bullshit from the disk
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "crosspost_parent_list", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
     const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];
 
     // Process posts first
@@ -78,6 +79,19 @@
             }
 
             post = filterJsonKeys(post, submissionKeysAllowed);
+
+            // clear unused fields + large array of unused thumbnails (best quality is used instead)
+            if(post?.gallery_data?.items) { post.gallery_data.items.forEach(item => {
+                item?.id ? delete item.id : null;
+                if(post?.media_metadata?.[item?.media_id]) {
+                    delete post.media_metadata[item.media_id].m;
+                    delete post.media_metadata[item.media_id].p;
+                }
+            })}
+
+            // reduce crosspost size too (filter each crosspost parent, not the outer post)
+            post.crosspost_parent_list ? post.crosspost_parent_list.forEach((crosspost, i, arr) => arr[i] = filterJsonKeys(crosspost, submissionKeysAllowed)) : null;
+
             context.processItem(post);
             //dbgpost++;
         }
diff --git a/src/sql.js b/src/sql.js
index e70aa65..eaf06f9 100644
--- a/src/sql.js
+++ b/src/sql.js
@@ -55,8 +55,31 @@ function mkTitle(post) {
 
 // wrap the url in singlequotes HERE and not in the query like '${lit(mkUrl(post))}'
 // this is because the null type in postgres will be turned into a string which will break lemmy until you remove the row or set it to null manually
+
+// the 1-line ternary gets obliterated thanks to 15,000,000 different edge cases
 function mkUrl(post) {
-    return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : `'${post.url}'`
+    if(post.is_gallery){
+        // deleted galleries have "url": null, not the text 'null'
+        // [0][0] means the first image's url
+        return post.url == null ? 'null' : `'${getSubmissionImages(post)[0][0]}'`;
+    } else if(post.is_self) {
+        return 'null'
+    } else if(post.crosspost_parent_list) {
+        // crosspost urls are just paths "/r/subreddit/id/...", so get the full url from the original post
+        // deleted crossposts have an empty array
+        if(post.crosspost_parent_list.length > 0) {
+            if(post.crosspost_parent_list[0].is_gallery){
+                return `'${getSubmissionImages(post.crosspost_parent_list[0])[0][0]}'`;
+            } else {
+                return `'${post.crosspost_parent_list[0].url}'`
+            }
+        } else {
+            return 'null'
+        }
+    } else {
+        return post.url == null || post.url == '' ? 'null' : `'${post.url}'`;
+    }
+    //return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : post.crosspost_parent_list ? `'${post.crosspost_parent_list[0].url}'` : `'${post.url}'`
 }
 
 /**
@@ -86,9 +109,24 @@ function mkBody(post, postType) {
     // double newlines for reddit spacing is done at the START of the next addition, this way there arent trailing newlines if theres nothing else after.
     outputBody += [...post.title].length > 200 ? `\n\n\`Full title: ${post.title}\`` : ''
 
+    // i want to scream: crossposts embed the original post's body instead of a url
+    if (post.crosspost_parent_list && post.crosspost_parent_list.length > 0) {
+        var crosspost = post.crosspost_parent_list[0];
+        outputBody += `\n\nCrosspost:`
+
+        outputBody += `\n\n${mkBody(crosspost, "post")}`
+        return outputBody;
+    }
+
     var postImages = getSubmissionImages(post);
     // add "Gallery links:" then all the image links as bullet points if the post is a gallery
-    outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => `\n\n- ${image}`).join('')}` : ''
+    outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => {
+        if(image.length > 1) {
+            // length > 1 means the image has a caption
+            return `\n\n- ${image[0]} - "${image[1]}"`
+        }
+        return `\n\n- ${image[0]}`
+    }).join('')}` : ''
 
     // only if selftext exists, it wont exist if its an image post
     outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
@@ -120,16 +158,27 @@
             // so we get s.u of the current key and push it to the array
             // if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
             var file = post.media_metadata[fileIDs.media_id];
+            var item = [];
             if (file.status == "valid") {
                 if (file.e == "Image"){
-                    gallery_collection.push(decodeHTML(file.s.u))
+                    item.push(decodeHTML(file.s.u))
                 } else if (file.e == "AnimatedImage") {
-                    gallery_collection.push(decodeHTML(file.s.gif))
+                    item.push(decodeHTML(file.s.gif))
                 } else {
                     console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
                 }
+                // if there was a known image type
+                if (item.length > 0) {
+                    // add the caption too if it exists
+                    if(fileIDs.caption) {
+                        item.push(fileIDs.caption)
+                    }
+
+                    gallery_collection.push(item);
+                }
+
             } /*else if (file.status == "failed") {
 
             }*/
@@ -141,7 +190,6 @@
     }
 }
 
-
 // Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
 // where comment and reply are ids in the comment table
 
@@ -267,4 +315,4 @@ function writeSql(post, targetCommName, targetUserName) {
     return `DO $$ ${query} $$;\n\n`
 }
 
-export { writeSql }
\ No newline at end of file
+export { writeSql, mkUrl }
\ No newline at end of file
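
A note on the new `mkUrl` export: a minimal sketch of how the edge cases above could be exercised, assuming the module is imported straight from `src/sql.js` as laid out in the diff; the script itself and the sample objects are hypothetical, not part of the patch.

```js
// quick manual check of mkUrl's edge cases (hypothetical script, not part of the patch)
import { mkUrl } from './src/sql.js';

// deleted gallery: the dump has "url": null, so the SQL value should be the bare keyword null
console.log(mkUrl({ is_gallery: true, url: null }));           // null
// self post: no link to store
console.log(mkUrl({ is_self: true }));                         // null
// deleted crosspost: crosspost_parent_list is present but empty
console.log(mkUrl({ crosspost_parent_list: [] }));             // null
// ordinary link post: wrapped in single quotes for the generated INSERT
console.log(mkUrl({ url: 'https://example.com/image.png' }));  // 'https://example.com/image.png'
```

Exporting `mkUrl` alongside `writeSql` makes this kind of spot check possible without generating a whole SQL script.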
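For the ap_id TODO, the README already describes the approach (take the federated url as an argument, then set ap_id to [the url + /type/ + sql id from the post]). In the same string-building style sql.js uses, that could look roughly like the sketch below; the helper name, the join on `community.name`, and the `LIKE` filter are assumptions about lemmy's schema, not code from this patch.

```js
// hypothetical helper sketching the README's ap_id idea; not part of this patch
function mkApIdFixup(federatedUrl, targetCommName) {
    // rewrite post.ap_id as federatedUrl + /post/ + the row's sql id, limited to the target community
    return `UPDATE post SET ap_id = '${federatedUrl}/post/' || post.id ` +
        `FROM community WHERE post.community_id = community.id AND community.name = '${targetCommName}';\n` +
        // comments hang off the posts rewritten above, so reuse the new post ap_ids as the filter
        `UPDATE comment SET ap_id = '${federatedUrl}/comment/' || comment.id ` +
        `FROM post WHERE comment.post_id = post.id AND post.ap_id LIKE '${federatedUrl}/post/%';\n`;
}
```

As the updated TODO notes, this may be unnecessary: lemmy's post_updates_2020_04_03 and comment_updates_2020_04_03 migrations appear to rebuild placeholder ap_ids on startup anyway.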