remove unused json fields when parsing to reduce memory/file size of the final json
parent 7b6b69141c
commit 5a2e268f3c
@@ -22,9 +22,9 @@ You can build the SQL script before making the comm/user though.
 - - since right now it just changes the upvotes to be negative or whatever the score is
 
-- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
+- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space. ✔
 
-- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
+- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing ✔
 
 ## references
 
src/index.js (15 changed lines)
@@ -32,7 +32,7 @@ var args = yargs(process.argv.slice(2))
     })
     .string(['comm', 'user', 'output', 'posts', 'comments'])
     .nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
-    .demandOption(['comm', 'user', 'output', 'posts', 'comments'])
+    .demandOption(['comm', 'user', 'posts', 'comments'])
     .help('h')
     .alias('h', 'help')
     .epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -44,8 +44,6 @@ processPostsAndComments(args.posts, args.comments, (result) => {
     console.log(result)
 }); */
 
-console.log(args.output?.trim())
-
 function printThreadStructure(thread, level = 0) {
     thread.forEach(item => {
         var out = '';
@@ -65,29 +63,34 @@ async function unflatten(postsFile, commentsFile) {
     //console.log('Thread Structure:');
     //printThreadStructure(result);
 
-    const resultOutput = `${result[0].subreddit}-threads.json`;
+    const subredditName = result[0].subreddit_name_prefixed.slice(2);
+    const resultOutput = `${subredditName}-threads.json`;
 
     // Write the result to a file
     writeFileSync(resultOutput, JSON.stringify(result, null, 2));
 
     result = {};
 
     // read the threads through a stream
     const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
 
     // empty the sql file if it exists
     existsSync(args.output) ? writeFileSync(args.output, '') : null
-    const sqlOutput = createWriteStream(args.output, {flags: "a"});
+    // write the sql to the user specified path, or use [subredditname].sql if none is chosen
+    const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
 
     var threadCounter = 0;
 
     // create an sql query to make a lemmy thread for each json object
     pipeline.on('data', (thread) => {
         sqlOutput.write(writeSql(thread.value, args.comm, args.user));
         threadCounter++;
     });
 
     // close the stream and say how many threads are processed and where its saved
     pipeline.on('end', () => {
         sqlOutput.close();
-        console.log(`Finished processing ${threadCounter} threads, sql saved to ${resultOutput}`);
+        console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
     });
 
     // old
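For reference, the write-then-stream-back pattern that unflatten now relies on looks roughly like this in isolation. This is a minimal sketch, not code from this repo: the stream-json import path is assumed, and buildThreads/handleThread are hypothetical placeholders for the tree-building and SQL-writing steps above.

```js
// Minimal sketch of the pattern above: write the nested thread trees to disk,
// drop the in-memory reference, then stream the array back element by element.
// stream-json's StreamArray emits {key, value} objects, so each thread comes
// back as chunk.value (matching thread.value in the diff above).
import { createReadStream, writeFileSync } from 'fs';
import StreamArray from 'stream-json/streamers/StreamArray.js'; // import path assumed

let threads = buildThreads();                 // hypothetical: array of post/comment trees
writeFileSync('threads.json', JSON.stringify(threads));
threads = null;                               // allow the large array to be garbage collected

const pipeline = createReadStream('threads.json').pipe(StreamArray.withParser());
pipeline.on('data', ({ value }) => handleThread(value)); // hypothetical per-thread handler
pipeline.on('end', () => console.log('done'));
```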
@@ -41,8 +41,26 @@ class ProcessingContext {
     }
 }
 
+// remove all fields from jsonObj that arent in the allowedKeys array
+function filterJsonKeys(jsonObj, allowedKeys) {
+    // Input validation
+    if (typeof jsonObj !== 'object' || jsonObj === null) {
+        throw new TypeError('Input must be an object');
+    }
+
+    return Object.fromEntries(
+        Object.entries(jsonObj)
+            .filter(([key]) => allowedKeys.includes(key))
+    );
+}
+
 async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
 
+    // allowed fields
+    // reduced the size of my dummy json to about 31% of its size without the filters
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
+    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
+
     // Process posts first
     const postsStream = createInterface({
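As a quick illustration, this is roughly what filterJsonKeys does to a single NDJSON line. It is a sketch that assumes filterJsonKeys and commentKeysAllowed are in scope; the sample comment object and its extra fields are made up, not taken from a real dump.

```js
// Made-up comment line from a Reddit dump; only keys in commentKeysAllowed survive.
const rawComment = {
    author: "someone",
    body: "nice post",
    created_utc: 1609459200,
    link_id: "t3_abc123",
    name: "t1_def456",
    parent_id: "t3_abc123",
    score: 5,
    subreddit_name_prefixed: "r/example",
    // fields like these get stripped by the filter
    all_awardings: [],
    gildings: {},
    total_awards_received: 0
};

const slim = filterJsonKeys(rawComment, commentKeysAllowed);
// slim keeps only the allowed keys that were present, which is where the
// "about 31% of its size" figure in the comment above comes from once the
// filter is applied across an entire dump.
```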
@@ -52,7 +70,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     for await (const line of postsStream) {
         if (line.trim()) {
-            const post = JSON.parse(line);
+            const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
             context.processItem(post);
         }
     }
@@ -65,7 +83,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     for await (const line of commentsStream) {
         if (line.trim()) {
-            const comment = JSON.parse(line);
+            const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
             context.processItem(comment);
         }
     }