diff --git a/README.md b/README.md
index 79a7471..9b74a2d 100644
--- a/README.md
+++ b/README.md
@@ -22,9 +22,9 @@ You can build the SQL script before making the comm/user though.
 - - since right now it just changes the upvotes to be negative or whatever the score is
-- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
+- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space. ✔
-- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
+- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing ✔
 
 ## references
diff --git a/src/index.js b/src/index.js
index 5250f2c..585f591 100644
--- a/src/index.js
+++ b/src/index.js
@@ -32,7 +32,7 @@ var args = yargs(process.argv.slice(2))
     })
     .string(['comm', 'user', 'output', 'posts', 'comments'])
     .nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
-    .demandOption(['comm', 'user', 'output', 'posts', 'comments'])
+    .demandOption(['comm', 'user', 'posts', 'comments'])
     .help('h')
     .alias('h', 'help')
     .epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -44,8 +44,6 @@ processPostsAndComments(args.posts, args.comments, (result) => {
     console.log(result)
 });
 */
-console.log(args.output?.trim())
-
 function printThreadStructure(thread, level = 0) {
     thread.forEach(item => {
         var out = '';
@@ -65,29 +63,34 @@ async function unflatten(postsFile, commentsFile) {
     //console.log('Thread Structure:');
     //printThreadStructure(result);
-    const resultOutput = `${result[0].subreddit}-threads.json`;
+    const subredditName = result[0].subreddit_name_prefixed.slice(2);
+    const resultOutput = `${subredditName}-threads.json`;
 
     // Write the result to a file
     writeFileSync(resultOutput, JSON.stringify(result, null, 2));
 
     result = {};
 
+    // read the threads through a stream
     const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
 
     // empty the sql file if it exists
     existsSync(args.output) ? writeFileSync(args.output, '') : null
 
-    const sqlOutput = createWriteStream(args.output, {flags: "a"});
+    // write the sql to the user-specified path, or use [subredditname].sql if none is chosen
+    const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
 
     var threadCounter = 0;
 
+    // create an sql query to make a lemmy thread for each json object
     pipeline.on('data', (thread) => {
         sqlOutput.write(writeSql(thread.value, args.comm, args.user));
         threadCounter++;
     });
 
+    // close the stream and say how many threads were processed and where it's saved
     pipeline.on('end', () => {
         sqlOutput.close();
-        console.log(`Finished processing ${threadCounter} threads, sql saved to ${resultOutput}`);
+        console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
     });
 
     // old
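The `unflatten()` change above leans on stream-json's StreamArray to read the threads file back one element at a time, which is what frees the memory once the big array has been written to disk. A minimal standalone sketch of that pattern, assuming `stream-json` is installed and using a hypothetical `threads.json` file holding one large JSON array:

```js
// minimal sketch of the StreamArray pattern used in unflatten();
// 'threads.json' is a hypothetical file containing one large JSON array
const { createReadStream } = require('fs');
const streamArray = require('stream-json/streamers/StreamArray');

const pipeline = createReadStream('threads.json').pipe(streamArray.withParser());

// each 'data' event delivers one array element as { key, value },
// so only a single thread has to sit in memory at a time
pipeline.on('data', ({ key, value }) => {
    console.log(`thread ${key}: ${value.title}`);
});

pipeline.on('end', () => console.log('all threads processed'));
```

The `args.output?.trim()` check is also what lets `--output` drop out of `demandOption`: an omitted flag leaves `args.output` undefined, the optional chaining turns that into a falsy value, and the write stream falls back to `${subredditName}.sql`.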
diff --git a/src/parser.js b/src/parser.js
index c49b768..e70dc77 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -41,8 +41,26 @@ class ProcessingContext {
     }
 }
 
+// remove all fields from jsonObj that aren't in the allowedKeys array
+function filterJsonKeys(jsonObj, allowedKeys) {
+    // Input validation
+    if (typeof jsonObj !== 'object' || jsonObj === null) {
+        throw new TypeError('Input must be an object');
+    }
+
+    return Object.fromEntries(
+        Object.entries(jsonObj)
+            .filter(([key]) => allowedKeys.includes(key))
+    );
+}
+
 async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
+
+    // allowed fields
+    // filtering reduced my dummy json to about 31% of its unfiltered size
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
+    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
 
     // Process posts first
     const postsStream = createInterface({
@@ -52,7 +70,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     for await (const line of postsStream) {
         if (line.trim()) {
-            const post = JSON.parse(line);
+            const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
             context.processItem(post);
         }
     }
@@ -65,7 +83,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     for await (const line of commentsStream) {
         if (line.trim()) {
-            const comment = JSON.parse(line);
+            const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
             context.processItem(comment);
         }
     }
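To make the effect of `filterJsonKeys` concrete, here is a self-contained sketch; the input line and the trimmed-down allow-list are made up for the demo, while the function body is the one added above:

```js
// made-up submission line; real dump entries carry far more fields
const line = '{"author":"someone","title":"hello","score":12,"thumbnail":"self","all_awardings":[]}';

// trimmed allow-list for the demo (the real lists live in parser.js above)
const submissionKeysAllowed = ["author", "score", "title"];

function filterJsonKeys(jsonObj, allowedKeys) {
    if (typeof jsonObj !== 'object' || jsonObj === null) {
        throw new TypeError('Input must be an object');
    }
    return Object.fromEntries(
        Object.entries(jsonObj).filter(([key]) => allowedKeys.includes(key))
    );
}

// "thumbnail" and "all_awardings" are dropped; only allow-listed keys survive
console.log(filterJsonKeys(JSON.parse(line), submissionKeysAllowed));
// => { author: 'someone', title: 'hello', score: 12 }
```

Per the ~31% figure noted in the diff, this filtering shrinks both the in-memory tree and the intermediate `${subredditName}-threads.json` file before the SQL pass.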