2025-01-18 04:44:26 +13:00
commit 191d4f5049
7 changed files with 601 additions and 0 deletions

81
src/index.js Normal file

@@ -0,0 +1,81 @@
import { processPostsAndComments } from './parser.js';
import { writeSql } from './sql.js';
import { join } from 'node:path';
// es6 >:(
import path from 'node:path';
import { fileURLToPath } from 'node:url';
import { existsSync, writeFileSync, appendFileSync } from 'node:fs';
export const __filename = fileURLToPath(import.meta.url);
export const __dirname = path.dirname(__filename);
import yargs from 'yargs';
// https://github.com/yargs/yargs/blob/main/docs/examples.md section "Yargs is here to help you..."
var args = yargs(process.argv.slice(2))
.alias('c', 'comm')
.alias('u', 'user')
.alias('o', 'output')
.describe({
'c': 'Name of the community the archive will be added to',
'u': 'Name of the user the archived posts will be made by',
'o': 'Path that the .sql file will save to',
'posts': 'The JSON dump file of submissions you got from https://the-eye.eu/redarcs/',
'comments': 'The JSON dump file of comments',
})
.string(['comm', 'user', 'output', 'posts', 'comments'])
.nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
.demandOption(['comm', 'user', 'output', 'posts', 'comments'])
.help('h')
.alias('h', 'help')
.epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
.parse();
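// a sketch of a possible invocation using the options above (file names here are made up, point them at your own dumps):
//   node src/index.js --comm mycommunity --user archive_bot \
//     --posts example_submissions.json --comments example_comments.json --output example.sql
// the generated .sql can then be run against the lemmy database, e.g. with psql -f example.sql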
/* // returns the trees filename
processPostsAndComments(args.posts, args.comments, (result) => {
console.log(result)
}); */
function printThreadStructure(thread, level = 0) {
thread.forEach(item => {
var out = '';
out += ' '.repeat(level) + `${item.name} by ${item.author}: `
// reddit item type 3 = submission/post, else it's a comment
out += item.name[1] == "3" ? `"${item.title}"` : item.body == "" ? "\"\"" : `"${item.body}"`
console.log(out)
if (item.children.length > 0) {
printThreadStructure(item.children, level + 1);
}
});
}
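/* illustrative output for a small thread (names and authors made up):
t3_abc12 by some_user: "An example post title"
 t1_def34 by another_user: "a top-level reply"
  t1_ghi56 by some_user: "a nested reply"
*/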
async function unflatten(postsFile, commentsFile) {
try {
const result = await processPostsAndComments(postsFile, commentsFile);
//console.log('Thread Structure:');
//printThreadStructure(result);
// Optional: write the result to a file
//writeFileSync('processed-threads.json', JSON.stringify(result, null, 2));
// empty the file if it exists
if (existsSync(args.output)) writeFileSync(args.output, '')
result.forEach(post => {
appendFileSync(args.output, writeSql(post, args.comm, args.user))
})
} catch (error) {
console.error('Error processing files:', error);
}
}
// Run the main function
unflatten(args.posts, args.comments);
//console.log("HOLY FUCKING SMOKES!" + existsSync(tree))
const outputPath = join(__dirname, '/', args.output);

76
src/parser.js Normal file

@@ -0,0 +1,76 @@
// parser.js, courtesy of claude.
import { createReadStream } from 'fs';
import { createInterface } from 'readline';
class ProcessingContext {
constructor() {
this.nodes = new Map();
this.rootPosts = new Map();
this.pendingChildren = new Map();
}
processItem(item) {
const node = { ...item, children: [] };
this.nodes.set(item.name, node);
const pendingChildren = this.pendingChildren.get(item.name) || [];
pendingChildren.forEach(child => {
node.children.push(child);
});
this.pendingChildren.delete(item.name);
// important edit i had to make: it assumed that posts had a null "parent_id" which could be used to distinguish them as posts
// in reality that field doesn't exist on posts, so i changed it to check for an undefined value (i.e. no parent_id field at all)
if (item.parent_id === undefined) {
this.rootPosts.set(item.name, node);
} else {
const parent = this.nodes.get(item.parent_id);
if (parent) {
parent.children.push(node);
} else {
if (!this.pendingChildren.has(item.parent_id)) {
this.pendingChildren.set(item.parent_id, []);
}
this.pendingChildren.get(item.parent_id).push(node);
}
}
}
getResult() {
return Array.from(this.rootPosts.values());
}
}
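/* a minimal sketch of how out-of-order items are handled (ids made up):
const ctx = new ProcessingContext();
ctx.processItem({ name: 't1_reply', parent_id: 't1_parent', body: 'seen first' });      // parent unknown yet, parked in pendingChildren
ctx.processItem({ name: 't1_parent', parent_id: 't3_post', body: 'adopts t1_reply' });  // picks up the parked child, then parks itself under t3_post
ctx.processItem({ name: 't3_post', title: 'no parent_id, so it becomes a root post' }); // adopts t1_parent and lands in rootPosts
ctx.getResult(); // [ t3_post with t1_parent -> t1_reply nested in children ]
*/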
async function processPostsAndComments(postsFile, commentsFile) {
const context = new ProcessingContext();
// Process posts first
const postsStream = createInterface({
input: createReadStream(postsFile),
crlfDelay: Infinity
});
for await (const line of postsStream) {
if (line.trim()) {
const post = JSON.parse(line);
context.processItem(post);
}
}
// Then process comments
const commentsStream = createInterface({
input: createReadStream(commentsFile),
crlfDelay: Infinity
});
for await (const line of commentsStream) {
if (line.trim()) {
const comment = JSON.parse(line);
context.processItem(comment);
}
}
return context.getResult();
}
export { processPostsAndComments };

250
src/sql.js Normal file

@@ -0,0 +1,250 @@
// shamelessly stolen code from https://github.com/mesmere/RedditLemmyImporter/blob/main/src/main/kotlin/write.kt
// also reading the lemmy schema in lemmy/crates/db_schema/src/schema.rs
import moment from 'moment';
import he from 'he';
/* // Utility function to escape values and wrap them in single quotes for SQL
function lit(str) {
if (typeof str === 'string') {
// Decode HTML entities for the content to display properly and escape special characters for SQL
let decodedStr = decodeHTML(str);
return `'${decodedStr.replace(/'/g, "''").replace(/\\/g, '\\\\')}'`; // Double escape single quotes and backslashes
} else {
return "NULL"; // Return 'NULL' if str is not a valid string
}
} */
// decodeHTML then replace all instances of ' with ''
function lit(str) {
return typeof str === 'string' ? decodeHTML(str).replace(/'/g, "''") : 'null'
}
// Decode HTML entities (e.g., '>' -> '>')
function decodeHTML(html) {
return he.decode(html); // Use the 'he' package to decode HTML entities
}
// Convert a UTC Unix timestamp to a Timestamptz in UTC (Postgres format)
// 1650751789 -> '2022-04-23 22:09:49' and NOT '2022-04-24 10:09:49' or anything else
// it'll match what the example link below shows in a browser where UTC is the default timezone (or if you're English)
// https://www.reddit.com/r/4tran/comments/uag3ji/screw_you_passoid/ example
// https://momentjs.com/docs/
// +00 is for the db to know it is in UTC
function toPostgresTimestamp(unixTimestamp) {
return `TIMESTAMP '${moment.unix(unixTimestamp).utc().format('YYYY-MM-DD HH:mm:ss')}+00'`;
}
// get timestamp for when the comment was edited or return null if it wasnt edited
function mkUpdated(post) {
// edited being a timestamp is considered truey
return post.edited ? toPostgresTimestamp(post.edited) : 'null'
}
function mkTitle(post) {
// i <3 es6 :)
// decodeHTML early since it changes length and we want to check against the final length
if ( [...decodeHTML(post.title)].length > 200) {
// truncate, turn back to string, + ...
return [...decodeHTML(post.title)].slice(0, 197).join("")+"..."
} else { return decodeHTML(post.title) }
}
// wrap the url in single quotes HERE and not in the query (i.e. not '${lit(mkUrl(post))}')
// this is because a null url would otherwise end up as the string 'null' in postgres, which will break lemmy until you remove the row or set the column to null manually
function mkUrl(post) {
return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : `'${post.url}'`
}
/**
* @returns {string} a post body with user/subreddit info, and images based on the post type
* @param post - Your json object for a submission or comment
* @param postType - "post" or "comment", the types are handled differently
*/
function mkBody(post, postType) {
// will be the final string
var outputBody = "";
// add "/u/" assuming the user isnt deleted
// also add starting backtick (escaped) so its in a nice codeblock (differentiate from the actual post content)
outputBody += post.author == "[deleted]" ? `\`${post.author}` : `\`/u/${post.author}`
// add flair if it exists
outputBody += post.author_flair_text && post.author_flair_text.trim() ? ` - ${post.author_flair_text}` : ''
// add original subreddit
outputBody += ` - originally from /${post.subreddit_name_prefixed}\``;
// then add the post body if it has one.
// comments use a different field for the body (.body and not .selftext)
if (postType == "post") {
// add full title if truncated
// double newlines for reddit spacing is done at the START of the next addition, this way there arent trailing newlines if theres nothing else after.
outputBody += [...post.title].length > 200 ? `\n\n\`Full title: ${post.title}\`` : ''
var postImages = getSubmissionImages(post);
// add "Gallery links:" then all the image links as bullet points if the post is a gallery
outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => `\n\n- ${image}`).join('')}` : ''
// only if selftext exists, it wont exist if its an image post
outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
} else if (postType == "comment") {
outputBody += post.body && post.body.trim() ? `\n\n` + post.body : ''
} else console.log("YOU FUCKED UP THE MKBODY CALL")
return outputBody;
}
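/* roughly what the header line of a generated body looks like (values made up):
`/u/some_user - some flair text - originally from /r/example`

the selftext (or comment body) then follows after a blank line
*/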
/**
* @returns {String[]} - the array which contains the urls of the best quality image versions, preserving the original order the post had
* @param post - Your json object for a submission
*/
function getSubmissionImages(post) {
// is_gallery is when there is >1 image (shocker)
if (post.is_gallery) {
var gallery_collection = [];
// iterate through JSON keys of gallery_data.
// we need to get the keys/IDs from gallery_data since its items are in the proper order that the gallery is,
// media_metadata sorts the images by alphanumerical order, and we want to preserve the proper order of the gallery
// still, we use the keys we get from gallery_data to get the links from media_metadata
post.gallery_data.items.forEach(image => {
// index media_metadata using the "media_id" of the current item we have iterated over in gallery_data
// in the item we have indexed:
// s = data for the best quality version of the image
// s.u = the url of it
// so we get s.u of the current key and push it to the array
gallery_collection.push(decodeHTML(post.media_metadata[image.media_id].s.u));
});
return gallery_collection;
} else {
// return the url of the post (which is the link to the image when not a gallery)
return [post.url];
}
}
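/* a sketch of the fields used above (trimmed, urls made up):
post.gallery_data  = { items: [ { media_id: 'abc123' }, { media_id: 'def456' } ] }
post.media_metadata = { abc123: { s: { u: 'https://example.com/abc123.jpg' } }, def456: { s: { u: 'https://example.com/def456.jpg' } } }
getSubmissionImages(post) -> ['https://example.com/abc123.jpg', 'https://example.com/def456.jpg'], in gallery order
*/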
// Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
// where comment and reply are ids in the comment table
/*
make functions for:
✔making a string literal to handle sql injection
✔handling a timestamp to work as a Timestamptz in UTC (e.g 2025-01-16 09:23:43.308171+00)
✔create the post body with selftext and additional info (like user info and flair)
✔if the post title is too long then add the full title to the body too (see line below)
✔also handle images in posts
✔if getSubmissionImages(post).length > 1:
✔forEach loop through the image links and add them to the post 'body'
✔handle the title by truncating max length of 200 Unicode code POINTS instead of UTF-16 code UNITS which the String: length property counts (important)
✔do the same with comment body
✔output the url to the first image in a gallery if the post is a gallery, if not, then output the url field if !post.is_self
✔for the sql create a function that takes a post object and:
✔DECLARE 3 variables as INTEGER:
✔comm_id, user_id, post_id
✔then DECLARE variables named {[post/comment].name}_id as INTEGER
✔do this by looping through every comment in the tree and do the post of course
✔BEGIN
✔SET comm_id and user_id based off their IDs in the database (community and person table) using the names given via user input here
✔insert a row into the "post" table using the data while RETURNING the id of the post INTO STRICT as post_id;
✔do the same in "post_aggregate" for the score minus the returning bit
✔loop through every comment in the tree
✔import the comment as you did for the post, dont specify a path and just let the db set a default for now
✔if its a top-level comment (parent_id and link_id will be the same), then leave it as is, since the defaulted path will be correct
✔if it has a parent, then set the path to [the path of its parent] + [.] + [its own id]
✔also, the tree must be looped through from the top-down, since you need to allocate ids for the highest comments first, since you wont be able to set the path right if a child is done first
✔END
execute the query like this https://www.postgresql.org/docs/current/sql-do.html
*/
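/* roughly what writeSql() below produces for a post with one top-level comment (ids and values made up, INSERT columns/values trimmed):
DO $$ DECLARE comm_id INTEGER;
DECLARE user_id INTEGER;
DECLARE root_post_id INTEGER;
DECLARE t3_abc12_id CONSTANT INTEGER := NULL;
DECLARE t1_def34_id INTEGER;
BEGIN
SELECT id INTO STRICT comm_id FROM community WHERE name='mycommunity';
SELECT id INTO STRICT user_id FROM person WHERE name='archive_bot';
INSERT INTO post (name, url, body, creator_id, community_id, locked, published, featured_community) VALUES (...) RETURNING id INTO STRICT root_post_id;
UPDATE post_aggregates SET score=123 WHERE post_id=root_post_id;
INSERT INTO comment (creator_id, post_id, content, published, updated) VALUES (...) RETURNING id INTO STRICT t1_def34_id;
UPDATE comment_aggregates SET upvotes=45 WHERE comment_id=t1_def34_id;
END $$;
*/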
/**
* @returns {string} an sql query for INSERTing the provided post and its entire comment tree into your lemmy database
* @param post - Your json object for a submission, it should have all of its comments in "children" (and ITS comments in "children" etc)
* @param targetCommName - name of the community you want the archive to be added to
* @param targetUserName - name of the user you want to create the archive under
*/
function writeSql(post, targetCommName, targetUserName) {
var query = '';
query += "DECLARE comm_id INTEGER;\nDECLARE user_id INTEGER;\nDECLARE root_post_id INTEGER;\n"
query += `DECLARE ${post.name}_id CONSTANT INTEGER := NULL;\n`
{
const stack = [...post.children || []]; // Create a shallow copy of the children array to keep the original intact
let index = 0; // Use an index pointer to track the position in the stack
while (index < stack.length) {
const current = stack[index]; // Get the current item to process
index++; // Move the index forward
// DECLARE an integer for the current comment
query += `DECLARE ${current.name}_id INTEGER;\n`;
// If it has children, add them to the stack
if (current.children) {
stack.push(...current.children); // Spread to add each child to the stack
}
}
}
query += "BEGIN\n"
// Get the IDs for the target comm and user
query += `SELECT id INTO STRICT comm_id FROM community WHERE name='${targetCommName}';\n`
query += `SELECT id INTO STRICT user_id FROM person WHERE name='${targetUserName}';\n`
// Insert the post.
query +=
`INSERT INTO post (name, url, body, creator_id, community_id, locked, published, featured_community) ` +
`VALUES ('${lit(mkTitle(post))}', ${mkUrl(post)}, '${lit(mkBody(post, "post"))}', user_id, comm_id, ${post.locked}, ${toPostgresTimestamp(post.created_utc)}, ${post.stickied}) ` +
`RETURNING id INTO STRICT root_post_id;\n`
// Likes
// We use post_aggregates and not post_like to change the score, unlike the kotlin RedditLemmyImporter
// This is because the like entries are now invalidated unless they are a score of -1 or 1 (likely because of the view upvotes feature? i'm probably wrong tho)
// post_aggregates is better to change anyway since its an INT8 and not an INT2, which means the score doesnt cap out at 2¹⁵-1, perhaps needing multiple entries for super popular posts
// we use an UPDATE statement since there is already a row that gets made when we inserted the post prior
// note: hovering over the upvote count will show the sad truth :(
query += `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`
// Traverse again but INSERT this time (this could probably be a function)
{
const stack = [...post.children || []]; // Create a shallow copy of the children array to keep the original intact
let index = 0; // Use an index pointer to track the position in the stack
while (index < stack.length) {
const current = stack[index]; // Get the current item to process
index++; // Move the index forward
console.log("bye")
// Insert the current comment
query +=
`INSERT INTO comment (creator_id, post_id, content, published, updated) ` +
`VALUES (user_id, root_post_id, '${lit(mkBody(current, "comment"))}', ${toPostgresTimestamp(current.created_utc)}, ${mkUpdated(current)}) ` +
`RETURNING id INTO STRICT ${current.name}_id;\n`
// if the parent and root post aren't the same (i.e its not a top level comment)
if (current.link_id != current.parent_id) {
// update the path to be its parent path + its own id
// this works since postgres fills in the gaps, if you update it to its parents path, it will actually end up as [parents path + .ownID]
query += `UPDATE comment SET path = (SELECT path FROM comment WHERE id=${current.parent_id}_id) WHERE id=${current.name}_id;\n`
}
query += `UPDATE comment_aggregates SET upvotes=${current.score} WHERE comment_id=${current.name}_id;\n`
// If it has children, add them to the stack
if (current.children) {
stack.push(...current.children); // Spread to add each child to the stack
}
}
}
query += "END"
return `DO $$ ${query} $$;\n\n`
}
export { writeSql }