remove unused json fields when parsing to reduce memory/file size of the final json

Author: starlight
Date: 2025-01-20 23:32:05 +13:00
Parent: 7b6b69141c
Commit: 5a2e268f3c
3 changed files with 31 additions and 10 deletions

View File

@@ -22,9 +22,9 @@ You can build the SQL script before making the comm/user though.
- - since right now it just changes the upvotes to be negative or whatever the score is
- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
## references

View File

@@ -32,7 +32,7 @@ var args = yargs(process.argv.slice(2))
})
.string(['comm', 'user', 'output', 'posts', 'comments'])
.nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
-.demandOption(['comm', 'user', 'output', 'posts', 'comments'])
+.demandOption(['comm', 'user', 'posts', 'comments'])
.help('h')
.alias('h', 'help')
.epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -44,8 +44,6 @@ processPostsAndComments(args.posts, args.comments, (result) => {
console.log(result)
}); */
-console.log(args.output?.trim())
function printThreadStructure(thread, level = 0) {
thread.forEach(item => {
var out = '';
@@ -65,29 +63,34 @@ async function unflatten(postsFile, commentsFile) {
//console.log('Thread Structure:');
//printThreadStructure(result);
-const resultOutput = `${result[0].subreddit}-threads.json`;
+const subredditName = result[0].subreddit_name_prefixed.slice(2);
+const resultOutput = `${subredditName}-threads.json`;
// Write the result to a file
writeFileSync(resultOutput, JSON.stringify(result, null, 2));
result = {};
+// read the threads through a stream
const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
// empty the sql file if it exists
existsSync(args.output) ? writeFileSync(args.output, '') : null
-const sqlOutput = createWriteStream(args.output, {flags: "a"});
+// write the sql to the user specified path, or use [subredditname].sql if none is chosen
+const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
var threadCounter = 0;
+// create an sql query to make a lemmy thread for each json object
pipeline.on('data', (thread) => {
sqlOutput.write(writeSql(thread.value, args.comm, args.user));
threadCounter++;
});
+// close the stream and say how many threads are processed and where its saved
pipeline.on('end', () => {
sqlOutput.close();
-console.log(`Finished processing ${threadCounter} threads, sql saved to ${resultOutput}`);
+console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
});
// old
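
For reference, here is a minimal standalone sketch of the write-then-stream pattern the hunk above introduces: dump the assembled array of threads to disk, drop the in-memory reference, then re-read the file with stream-json's StreamArray so only one thread is parsed at a time while the SQL is generated. The file name and the handleThread callback are illustrative stand-ins, not identifiers from the repository.

```js
// Sketch only: assumes stream-json is installed; names are placeholders.
const fs = require('fs');
const StreamArray = require('stream-json/streamers/StreamArray');

function dumpAndStream(result, handleThread) {
    // 1. Persist the fully built array of threads, then let it be garbage collected.
    const tmpFile = 'threads.json';
    fs.writeFileSync(tmpFile, JSON.stringify(result));
    result = null;

    // 2. Re-read the file as a stream; StreamArray emits one {key, value} pair per
    //    array element, so only a single thread is held in memory at a time.
    const pipeline = fs.createReadStream(tmpFile).pipe(StreamArray.withParser());
    pipeline.on('data', ({ value }) => handleThread(value));
    pipeline.on('end', () => console.log('done streaming threads'));
}
```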

View File

@@ -41,9 +41,27 @@ class ProcessingContext {
}
}
+// remove all fields from jsonObj that arent in the allowedKeys array
+function filterJsonKeys(jsonObj, allowedKeys) {
+// Input validation
+if (typeof jsonObj !== 'object' || jsonObj === null) {
+throw new TypeError('Input must be an object');
+}
+return Object.fromEntries(
+Object.entries(jsonObj)
+.filter(([key]) => allowedKeys.includes(key))
+);
+}
async function processPostsAndComments(postsFile, commentsFile) {
const context = new ProcessingContext();
+// allowed fields
+// reduced the size of my dummy json to about 31% of its size without the filters
+const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
+const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
// Process posts first
const postsStream = createInterface({
input: createReadStream(postsFile),
@@ -52,7 +70,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
for await (const line of postsStream) {
if (line.trim()) {
-const post = JSON.parse(line);
+const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
context.processItem(post);
}
}
@@ -65,7 +83,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
for await (const line of commentsStream) {
if (line.trim()) {
-const comment = JSON.parse(line);
+const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
context.processItem(comment);
}
}
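
As a quick usage illustration of the new filterJsonKeys helper, assuming it is in scope: every key that is not whitelisted is dropped, which is what shrinks each parsed NDJSON line before it enters the tree. The sample comment object below is fabricated for the example, not taken from a real dump.

```js
// Hypothetical comment line from an NDJSON dump (field values made up).
const rawComment = {
    author: 'some_user',
    body: 'example comment text',
    created_utc: 1577836800,
    link_id: 't3_abc123',
    name: 't1_def456',
    parent_id: 't3_abc123',
    score: 42,
    subreddit_name_prefixed: 'r/example',
    // fields like these are discarded by the filter:
    all_awardings: [],
    gildings: {},
    retrieved_on: 1577836900
};

const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];

const slimComment = filterJsonKeys(rawComment, commentKeysAllowed);
// slimComment keeps only the whitelisted fields that are actually present,
// e.g. { author: 'some_user', body: 'example comment text', ... }
```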