Compare commits

...

3 Commits

5 changed files with 132 additions and 25 deletions

.gitignore
View File

@@ -1,6 +1,6 @@
dummy/
node_modules/
*.sql
*.sql*
ok
src/test.js
*-threads.json
*-threads.json*

View File

@@ -1,6 +1,8 @@
# reddit-lemmy-importer
turn json files downloaded from https://the-eye.eu/redarcs/ into lemmy comms :D
*Note: these archives only cover 12/31/2022 and earlier*
this is effectively https://github.com/mesmere/RedditLemmyImporter but in js and for a different type of archive
the posts/comments dump is read as a stream, so handling bigger subreddits is less ram-intensive (though the final tree will still take up a good amount of ram, so maybe create a big swapfile if processing large subreddits)
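As a rough illustration of that line-by-line streaming (a minimal sketch; the actual reader lives in processPostsAndComments further down in this diff):

```js
// minimal sketch of streaming an NDJSON dump line by line instead of loading it whole;
// mirrors the createInterface / for-await pattern used by processPostsAndComments below
import { createReadStream } from 'fs';
import { createInterface } from 'readline';

async function countDumpLines(dumpPath) {
    const rl = createInterface({ input: createReadStream(dumpPath), crlfDelay: Infinity });
    let lines = 0;
    for await (const line of rl) {
        if (line.trim()) lines++;   // each non-empty line is one JSON object (post or comment)
    }
    return lines;
}
```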
@@ -9,6 +11,8 @@ the posts/comments dump is read as a stream so handling bigger subreddits is les
You can build the SQL script before making the comm/user though.
## usage:
install dependencies
@@ -33,7 +37,7 @@ option 3: import your sql to a docker container
## TODO:
- set URL embed titles/descriptions and url_content type and embed_video_url in posts
- FIX ap_id!!!!!
- FIX ap_id!!!!! (may not be needed, see lemmy/src/code_migrations.rs function post_updates_2020_04_03 and comment_updates_2020_04_03)
  - this could be done by taking the federated url as an argument, then updating the ap_id using [the url + /type/ + sql id from the post] (a sketch follows below)
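A hedged sketch of what that TODO could look like, assuming the importer is given the federated base url as an argument (mkApIdFix and the `local = true` filter are illustrative assumptions, not code from this repo; the `/post/` and `/comment/` path shapes follow lemmy's apub routes):

```js
// hypothetical sketch of the ap_id TODO: rebuild ap_id as [base url + /type/ + sql id],
// similar to what lemmy's post_updates_2020_04_03 / comment_updates_2020_04_03 migrations do
function mkApIdFix(baseUrl) {
    return `UPDATE post SET ap_id = '${baseUrl}/post/' || id WHERE local = true;\n` +
           `UPDATE comment SET ap_id = '${baseUrl}/comment/' || id WHERE local = true;\n`;
}
```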

View File

@@ -63,28 +63,36 @@ async function unflatten(postsFile, commentsFile) {
//console.log('Thread Structure:');
//printThreadStructure(result);
const subredditName = result[0].subreddit_name_prefixed.slice(2);
const resultOutput = `${subredditName}-threads.json`;
const resultOutput = `${args.comm}-threads.json`;
const subredditName = result[0].subreddit
const sqlOutputPath = args.output?.trim() ? args.output : `${subredditName}.sql`;
// Write the result to a file
writeFileSync(resultOutput, JSON.stringify(result, null, 2));
// console.log(result[357])
result = {};
// read the threads through a stream
const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
// empty the sql file if it exists
existsSync(args.output) ? writeFileSync(args.output, '') : null
// write the sql to the user specified path, or use [subredditname].sql if none is chosen
const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
existsSync(sqlOutputPath) ? writeFileSync(sqlOutputPath, '') : null
// write the sql to the path
const sqlOutput = createWriteStream(sqlOutputPath, {flags: "a"});
var threadCounter = 0;
// create an sql query to make a lemmy thread for each json object
pipeline.on('data', (thread) => {
try{
sqlOutput.write(writeSql(thread.value, args.comm, args.user));
threadCounter++;
//console.log(threadCounter)
//threadCounter == 467 ? console.log( `${threadCounter} ${thread}`) : null
} catch (error) {
console.error(`Error processing post ${thread.value.name}:`, error);
}
});
// close the stream and say how many threads were processed and where it's saved
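For reference, the threads file written above is read back with stream-json's StreamArray streamer (imported as streamArray in this repo); a minimal standalone sketch of that pattern, with a made-up file name:

```js
// stream a large JSON array element by element instead of JSON.parse-ing the whole file;
// StreamArray.withParser() and the {key, value} data events are stream-json's documented API
import { createReadStream } from 'fs';
import StreamArray from 'stream-json/streamers/StreamArray.js';

const pipeline = createReadStream('example-threads.json').pipe(StreamArray.withParser());
let count = 0;
pipeline.on('data', ({ value }) => {
    count++;   // `value` is one fully parsed thread object
});
pipeline.on('end', () => console.log(`processed ${count} threads`));
```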

View File

@@ -58,9 +58,10 @@ async function processPostsAndComments(postsFile, commentsFile) {
const context = new ProcessingContext();
// allowed fields
// reduced the size of my dummy json to about 31% of its size without the filters
const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
// reduced the size of my dummy json to about 22% of its original size without the filters
// also makes writing sql a couple seconds faster since it's reading less bullshit from the disk
const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "crosspost_parent_list", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];
// Process posts first
const postsStream = createInterface({
@@ -68,10 +69,31 @@ async function processPostsAndComments(postsFile, commentsFile) {
crlfDelay: Infinity
});
//var dbgpost = 0;
for await (const line of postsStream) {
if (line.trim()) {
const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
// i think this is only a problem for the comments (see below) but i did it here too as a safety measure
var post = JSON.parse(line)
if(!post.name){
post.name = `t3_${post.id}`;
}
post = filterJsonKeys(post, submissionKeysAllowed);
// clear unused fields + large array of unused thumbnails (best quality is used instead)
if(post?.gallery_data?.items) { post.gallery_data.items.forEach(item => {
item?.id ? delete item.id : null;
if(post?.media_metadata?.[item?.media_id]) {
delete post.media_metadata[item.media_id].m;
delete post.media_metadata[item.media_id].p;
}
})}
// reduce crosspost size too (use map and assign back so the filtered copies are actually kept)
if (post.crosspost_parent_list) {
post.crosspost_parent_list = post.crosspost_parent_list.map(crosspost => filterJsonKeys(crosspost, submissionKeysAllowed));
}
context.processItem(post);
//dbgpost++;
}
}
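For context, one trimmed gallery entry looks roughly like this (field names taken from the code above, values made up): `s` is the best-quality rendition that is kept, while `m` and `p` (mime type and the thumbnail array) are what gets deleted.

```js
// illustrative shape of a single media_metadata entry after JSON.parse (values are made up);
// the trimming above keeps .s (plus .e/.status for later checks) and deletes .m and .p
const mediaMetadataEntry = {
    status: "valid",
    e: "Image",                                   // "AnimatedImage" entries keep their url in s.gif
    m: "image/jpg",                               // deleted: mime type, not needed for the url
    p: [/* thumbnail renditions, deleted */],
    s: { u: "https://preview.redd.it/example.jpg", x: 1920, y: 1080 }   // kept: best quality url
};
```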
@@ -83,7 +105,14 @@ async function processPostsAndComments(postsFile, commentsFile) {
for await (const line of commentsStream) {
if (line.trim()) {
const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
// dont filter yet so that we can have the id key
var comment = JSON.parse(line)
// if its a comment with no "name" then make a "name" field
if(!comment.name){
comment.name = `t1_${comment.id}`;
}
comment = filterJsonKeys(comment, commentKeysAllowed);
context.processItem(comment);
}
}
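filterJsonKeys itself isn't part of this diff; a minimal sketch of what an allow-list filter like it might look like (an assumption about its behaviour, not the repo's actual implementation):

```js
// hypothetical allow-list filter: keep only whitelisted top-level keys of a parsed object
function filterJsonKeys(obj, allowedKeys) {
    return Object.fromEntries(
        Object.entries(obj).filter(([key]) => allowedKeys.includes(key))
    );
}
```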

View File

@@ -17,8 +17,10 @@ function lit(str) {
} */
// decodeHTML then replace all instances of ' with ''
// then escape $ as \$, since saying "$$$" (like money) will close the "DO $$" statement :(
function lit(str) {
return typeof str === 'string' ? decodeHTML(str).replace(/'/g, "''") : 'null'
return typeof str === 'string' ? decodeHTML(str).replace(/'/g, "''").replace(/\$/g, "\\$") : 'null'
}
// Decode HTML entities (e.g., '&gt;' -> '>')
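A small worked example of the new escaping (decodeHTML omitted for brevity): doubled quotes survive the '...' literal, and no bare `$$` run is left to terminate the surrounding DO block.

```js
// ' is doubled for the SQL string literal, $ is backslash-escaped so "$$$" can no longer
// close the DO $$ ... $$ wrapper that writeSql() emits around each thread
const escaped = "costs $5, isn't it".replace(/'/g, "''").replace(/\$/g, "\\$");
console.log(escaped);   // costs \$5, isn''t it
```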
@@ -52,9 +54,32 @@ function mkTitle(post) {
}
// wrap the url in singlequotes HERE and not in the query like '${lit(mkUrl(post))}'
// this is because the null type in postgres will be turned into a string which will break lemmy until you remove the row or set it to null manually
// 1 line ternary operator gets obliterated thanks to 15,000,000 different edge cases
function mkUrl(post) {
return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : `'${post.url}'`
if(post.is_gallery){
// deleted galleries have "url": null (an actual JSON null, not the text 'null')
// [0][0] means the first image's url
return post.url == null ? 'null' : `'${getSubmissionImages(post)[0][0]}'`;
} else if(post.is_self) {
return 'null'
} else if(post.crosspost_parent_list) {
// crosspost urls are just paths "/r/subreddit/id/...", so get the full url from the original post
// deleted crossposts have an empty array
if(post.crosspost_parent_list.length > 0) {
// check the crosspost parent here, since post.is_gallery itself was already handled above
if(post.crosspost_parent_list[0].is_gallery){
return `'${getSubmissionImages(post.crosspost_parent_list[0])[0][0]}'`;
} else {
return `'${post.crosspost_parent_list[0].url}'`
}
} else {
return 'null'
}
} else {
return post.url == null || post.url == '' ? 'null' : `'${post.url}'`;
}
//return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : post.crosspost_parent_list ? `'${post.crosspost_parent_list[0].url}'` : `'${post.url}'`
}
/**
@@ -74,7 +99,8 @@ function mkBody(post, postType) {
outputBody += post.author_flair_text && post.author_flair_text.trim() ? ` - ${post.author_flair_text}` : ''
// add original subreddit
outputBody += ` - originally from /${post.subreddit_name_prefixed}\``;
// changed from post.subreddit_name_prefixed to post.subreddit since the former isn't in all posts
outputBody += ` - originally from /r/${post.subreddit}\``;
// then add the post body if it has one.
// comments use a different field for the body (.body and not .selftext)
@@ -83,15 +109,30 @@ function mkBody(post, postType) {
// double newlines for reddit spacing is done at the START of the next addition, this way there aren't trailing newlines if there's nothing else after.
outputBody += [...post.title].length > 200 ? `\n\n\`Full title: ${post.title}\`` : ''
// i want to scream
if (post.crosspost_parent_list && post.crosspost_parent_list.length > 0) {
var crosspost = post.crosspost_parent_list[0];
outputBody += `\n\nCrosspost:`
outputBody += `\n\n${mkBody(crosspost, "post")}`
return outputBody;
}
var postImages = getSubmissionImages(post);
// add "Gallery links:" then all the image links as bullet points if the post is a gallery
outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => `\n\n- ${image}`).join('')}` : ''
outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => {
if(image.length > 1) {
// >1 means the image has a caption
return `\n\n- ${image[0]} - "${image[1]}"`
}
return `\n\n- ${image[0]}`
}).join('')}` : ''
// only if selftext exists; it won't exist if it's an image post
outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
} else if (postType == "comment") {
outputBody += post.body && post.body.trim() ? `\n\n` + post.body : ''
} else console.log("YOU FUCKED UP THE MKBODY CALL")
} else console.error("YOU FUCKED UP THE MKBODY CALL")
return outputBody;
}
@@ -102,19 +143,45 @@ function mkBody(post, postType) {
*/
function getSubmissionImages(post) {
// is_gallery is when there is >1 image (shocker)
if (post.is_gallery) {
// deleted gallery posts have their gallery_data and media_metadata erased too
if (post.is_gallery && post.gallery_data != null && post.media_metadata != null) {
var gallery_collection = [];
// iterate through JSON keys of gallery_data.
// we need to get the keys/IDs from gallery_data since its items are in the order the gallery actually displays;
// media_metadata sorts the images alphanumerically, and we want to preserve the gallery's proper order
// still, we use the keys we get from gallery_data to get the links from media_metadata
post.gallery_data.items.forEach(image => {
post.gallery_data.items.forEach(fileIDs => {
// index media_metadata using the "media_id" of the current item we have iterated over in gallery_data
// in the item we have indexed:
// s = data for the best quality version of the image
// s.u = the url of it
// so we get s.u of the current key and push it to the array
gallery_collection.push(decodeHTML(post.media_metadata[image.media_id].s.u));
// if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
var file = post.media_metadata[fileIDs.media_id];
var item = [];
if (file.status == "valid") {
if (file.e == "Image"){
item.push(decodeHTML(file.s.u))
} else if (file.e == "AnimatedImage") {
item.push(decodeHTML(file.s.gif))
} else {
console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
}
// if there was a known image type
if (item.length>0) {
// add caption too if it exists
if(fileIDs.caption) {
item.push(fileIDs.caption)
}
gallery_collection.push(item);
}
} /*else if (file.status == "failed") {
}*/
});
return gallery_collection;
} else {
@@ -123,7 +190,6 @@ function getSubmissionImages(post) {
}
}
// Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
// where comment and reply are ids in the comment table
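For example (made-up comment ids), the nesting described above looks like this:

```js
// illustrative ltree paths for the comment tree layout described above (ids are made up)
const topLevelPath = '0.987';             // comment 987 replies directly to the post
const replyPath    = '0.987.1042';        // comment 1042 replies to comment 987
const deeperPath   = '0.987.1042.1337';   // and so on, one comment id per level
```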
@@ -209,6 +275,7 @@ function writeSql(post, targetCommName, targetUserName) {
// post_aggregates is better to change anyway since it's an INT8 and not an INT2, so the score doesn't cap out at 2¹⁵-1 and super popular posts don't need multiple entries
// we use an UPDATE statement since there is already a row that gets made when we inserted the post prior
// note: hovering over the upvote count will show the sad truth :(
query += `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`
// Traverse again but INSERT this time (this could probably be a function)
@@ -220,7 +287,6 @@ function writeSql(post, targetCommName, targetUserName) {
const current = stack[index]; // Get the current item to process
index++; // Move the index forward
console.log("bye")
// Insert the current comment
query +=
`INSERT INTO comment (creator_id, post_id, content, published, updated) ` +
@@ -249,4 +315,4 @@ function writeSql(post, targetCommName, targetUserName) {
return `DO $$ ${query} $$;\n\n`
}
export { writeSql }
export { writeSql, mkUrl }
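Since mkUrl is now exported alongside writeSql, a quick hedged usage sketch against its branches (minimal made-up post objects; the import path is an assumption, the diff only shows the export):

```js
// illustrative calls covering the branches of the new mkUrl (made-up minimal post objects)
import { mkUrl } from './writeSql.js';   // path/filename is an assumption

mkUrl({ is_self: true, selftext: "text post" });   // => 'null'  (the string "null", i.e. SQL NULL)
mkUrl({ url: "https://i.redd.it/abc.jpg" });       // => "'https://i.redd.it/abc.jpg'"
mkUrl({ is_gallery: true, url: null });            // => 'null'  (deleted gallery)
mkUrl({ crosspost_parent_list: [] });              // => 'null'  (deleted crosspost)
```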