Compare commits

2 Commits

7 changed files with 78 additions and 18 deletions

.gitignore vendored
View File

@@ -3,4 +3,4 @@ node_modules/
 *.sql
 ok
 src/test.js
-processed-threads.json
+*-threads.json

View File

@@ -22,9 +22,9 @@ You can build the SQL script before making the comm/user though.
 - - since right now it just changes the upvotes to be negative or whatever the score is
-- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
+- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
-- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
+- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
 ## references
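
The last two TODO items are what this compare implements. As a rough sketch of the StreamArray technique they describe, assuming a file that holds one big top-level JSON array; the file name threads.json is a placeholder, not the project's:

import { createReadStream } from 'node:fs';
import StreamArray from 'stream-json/streamers/StreamArray.js';

// Stream the array from disk: elements are parsed and emitted one at a
// time, so the whole array never has to sit in memory at once.
const pipeline = createReadStream('threads.json').pipe(StreamArray.withParser());

// stream-json emits one { key, value } chunk per array element, where
// key is the element's index and value is the parsed element itself.
pipeline.on('data', ({ key, value }) => {
    console.log(`element ${key}:`, value);
});
pipeline.on('end', () => console.log('done reading'));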

package.json
View File

@@ -10,6 +10,7 @@
     "chalk": "^5.3.0",
     "he": "^1.2.0",
     "moment": "^2.30.1",
+    "stream-json": "^1.9.1",
     "yargs": "^17.7.2"
   },
   "scripts": {

View File

@@ -2,20 +2,22 @@ import { processPostsAndComments } from './parser.js';
 import { writeSql } from './sql.js';
 import { join } from "node:path";
-// es6 >:(
+// es6 path >:(
 import path from 'path';
 import {
     fileURLToPath
 } from 'url';
-import { exists, existsSync, writeFileSync, appendFileSync } from 'node:fs';
 export const __filename = fileURLToPath(
     import.meta.url);
 export const __dirname = path.dirname(__filename);
+import { existsSync, writeFileSync, createReadStream, createWriteStream } from 'node:fs';
 import yargs from 'yargs';
+import streamArray from 'stream-json/streamers/StreamArray.js';
 // https://github.com/yargs/yargs/blob/main/docs/examples.md section "Yargs is here to help you..."
 var args = yargs(process.argv.slice(2))
     .alias('c', 'comm')
@@ -30,7 +32,7 @@ var args = yargs(process.argv.slice(2))
     })
     .string(['comm', 'user', 'output', 'posts', 'comments'])
     .nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
-    .demandOption(['comm', 'user', 'output', 'posts', 'comments'])
+    .demandOption(['comm', 'user', 'posts', 'comments'])
     .help('h')
     .alias('h', 'help')
     .epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -57,18 +59,44 @@ function printThreadStructure(thread, level = 0) {
 async function unflatten(postsFile, commentsFile) {
     try {
-        const result = await processPostsAndComments(postsFile, commentsFile);
+        var result = await processPostsAndComments(postsFile, commentsFile);
         //console.log('Thread Structure:');
         //printThreadStructure(result);
-        // Optional: write the result to a file
-        //writeFileSync('processed-threads.json', JSON.stringify(result, null, 2));
+        const subredditName = result[0].subreddit_name_prefixed.slice(2);
+        const resultOutput = `${subredditName}-threads.json`;
-        // empty the file if it exists
+        // Write the result to a file
+        writeFileSync(resultOutput, JSON.stringify(result, null, 2));
+        result = {};
+        // read the threads through a stream
+        const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
+        // empty the sql file if it exists
         existsSync(args.output) ? writeFileSync(args.output, '') : null
-        result.forEach(post => {
+        // write the sql to the user specified path, or use [subredditname].sql if none is chosen
+        const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
+        var threadCounter = 0;
+        // create an sql query to make a lemmy thread for each json object
+        pipeline.on('data', (thread) => {
+            sqlOutput.write(writeSql(thread.value, args.comm, args.user));
+            threadCounter++;
+        });
+        // close the stream and say how many threads are processed and where its saved
+        pipeline.on('end', () => {
+            sqlOutput.close();
+            console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
+        });
+        // old
+        /* result.forEach(post => {
             appendFileSync(args.output, writeSql(post, args.comm, args.user))
-        })
+        }) */
     } catch (error) {
         console.error('Error processing files:', error);
     }
@@ -77,5 +105,4 @@ async function unflatten(postsFile, commentsFile) {
 // Run the main function
 unflatten(args.posts, args.comments);
 //console.log("HOLY FUCKING SMOKES!" + existsSync(tree))
-const outputPath = join(__dirname, '/', args.output);
+//const outputPath = join(__dirname, '/', args.output);
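
With --output no longer demanded, a hypothetical invocation could look like the line below; the script name main.js and all argument values are placeholders, and omitting --output makes the SQL land in [subredditname].sql:

node main.js --comm mycommunity --user archive_bot --posts example_posts.ndjson --comments example_comments.ndjson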

parser.js
View File

@@ -41,8 +41,26 @@ class ProcessingContext {
     }
 }
+// remove all fields from jsonObj that arent in the allowedKeys array
+function filterJsonKeys(jsonObj, allowedKeys) {
+    // Input validation
+    if (typeof jsonObj !== 'object' || jsonObj === null) {
+        throw new TypeError('Input must be an object');
+    }
+    return Object.fromEntries(
+        Object.entries(jsonObj)
+            .filter(([key]) => allowedKeys.includes(key))
+    );
+}
 async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
+    // allowed fields
+    // reduced the size of my dummy json to about 31% of its size without the filters
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
+    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
     // Process posts first
     const postsStream = createInterface({
@@ -52,7 +70,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
     for await (const line of postsStream) {
         if (line.trim()) {
-            const post = JSON.parse(line);
+            const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
             context.processItem(post);
         }
     }
@@ -65,7 +83,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
     for await (const line of commentsStream) {
         if (line.trim()) {
-            const comment = JSON.parse(line);
+            const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
             context.processItem(comment);
         }
     }
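
For illustration, filterJsonKeys keeps only the allowed keys and drops everything else; a hypothetical call with invented values:

filterJsonKeys({ author: 'someone', body: 'hello', gilded: 0, banned_by: null }, ['author', 'body']);
// => { author: 'someone', body: 'hello' }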

sql.js
View File

@@ -1,5 +1,7 @@
 // shamelessly stolen code from https://github.com/mesmere/RedditLemmyImporter/blob/main/src/main/kotlin/write.kt
 // also reading the lemmy schema in lemmy/crates/db_schema/src/schema.rs
+// reads the created tree of a post and its comments and builds a json query to add it to your lemmy comm
 import moment from 'moment';
 import he from 'he';

yarn.lock
View File

@@ -75,6 +75,18 @@ require-directory@^2.1.1:
   resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42"
   integrity sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==
 
+stream-chain@^2.2.5:
+  version "2.2.5"
+  resolved "https://registry.yarnpkg.com/stream-chain/-/stream-chain-2.2.5.tgz#b30967e8f14ee033c5b9a19bbe8a2cba90ba0d09"
+  integrity sha512-1TJmBx6aSWqZ4tx7aTpBDXK0/e2hhcNSTV8+CbFJtDjbb+I1mZ8lHit0Grw9GRT+6JbIrrDd8esncgBi8aBXGA==
+
+stream-json@^1.9.1:
+  version "1.9.1"
+  resolved "https://registry.yarnpkg.com/stream-json/-/stream-json-1.9.1.tgz#e3fec03e984a503718946c170db7d74556c2a187"
+  integrity sha512-uWkjJ+2Nt/LO9Z/JyKZbMusL8Dkh97uUBTv3AJQ74y07lVahLY4eEFsPsE97pxYBwr8nnjMAIch5eqI0gPShyw==
+  dependencies:
+    stream-chain "^2.2.5"
+
 string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"