From c76e3e7cbba3a81d30b068cf84047dbadd37f99f Mon Sep 17 00:00:00 2001
From: starlight
Date: Tue, 21 Jan 2025 21:08:26 +1300
Subject: [PATCH] fix SQL building for deleted gallery posts and for galleries
 containing images whose status is "failed"

---
 src/index.js  | 18 +++++++++++++-----
 src/parser.js |  7 +++++--
 src/sql.js    | 28 ++++++++++++++++++++++------
 3 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/src/index.js b/src/index.js
index 585f591..18fa691 100644
--- a/src/index.js
+++ b/src/index.js
@@ -63,28 +63,36 @@ async function unflatten(postsFile, commentsFile) {
     //console.log('Thread Structure:');
     //printThreadStructure(result);
 
-    const subredditName = result[0].subreddit_name_prefixed.slice(2);
-    const resultOutput = `${subredditName}-threads.json`;
+    const resultOutput = `${args.comm}-threads.json`;
+    const subredditName = result[0].subreddit;
+    const sqlOutputPath = args.output?.trim() ? args.output : `${subredditName}.sql`;
 
     // Write the result to a file
     writeFileSync(resultOutput, JSON.stringify(result, null, 2));
+    // console.log(result[357])
     result = {};
 
     // read the threads through a stream
     const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
 
     // empty the sql file if it exists
-    existsSync(args.output) ? writeFileSync(args.output, '') : null
-    // write the sql to the user specified path, or use [subredditname].sql if none is chosen
-    const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
+    existsSync(sqlOutputPath) ? writeFileSync(sqlOutputPath, '') : null
+    // write the sql to the path
+    const sqlOutput = createWriteStream(sqlOutputPath, {flags: "a"});
 
     var threadCounter = 0;
     // create an sql query to make a lemmy thread for each json object
     pipeline.on('data', (thread) => {
+      try {
         sqlOutput.write(writeSql(thread.value, args.comm, args.user));
         threadCounter++;
+        //console.log(threadCounter)
+        //threadCounter == 467 ? console.log(`${threadCounter} ${thread}`) : null
+      } catch (error) {
+        console.error('Error processing post:', error);
+      }
     });
 
     // close the stream and say how many threads are processed and where its saved
diff --git a/src/parser.js b/src/parser.js
index e70dc77..3045107 100644
--- a/src/parser.js
+++ b/src/parser.js
@@ -59,8 +59,8 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     // allowed fields
     // reduced the size of my dummy json to about 31% of its size without the filters
-    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
-    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
+    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];
 
     // Process posts first
     const postsStream = createInterface({
@@ -68,10 +68,13 @@ async function processPostsAndComments(postsFile, commentsFile) {
         crlfDelay: Infinity
     });
 
+    var dbgpost = 0;
     for await (const line of postsStream) {
         if (line.trim()) {
             const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
             context.processItem(post);
+            //dbgpost == 467 ? console.log(dbgpost + line) : null;
+            dbgpost++;
         }
     }
 
diff --git a/src/sql.js b/src/sql.js
index c11beed..9d41c1e 100644
--- a/src/sql.js
+++ b/src/sql.js
@@ -74,7 +74,8 @@ function mkBody(post, postType) {
     outputBody += post.author_flair_text && post.author_flair_text.trim() ? ` - ${post.author_flair_text}` : ''
 
     // add original subreddit
-    outputBody += ` - originally from /${post.subreddit_name_prefixed}\``;
+    // changed from post.subreddit_name_prefixed to post.subreddit since the former isn't in all posts
+    outputBody += ` - originally from /r/${post.subreddit}\``;
 
     // then add the post body if it has one.
     // comments use a different field for the body (.body and not .selftext)
@@ -91,7 +92,7 @@ function mkBody(post, postType) {
         outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
     } else if (postType == "comment") {
         outputBody += post.body && post.body.trim() ? `\n\n` + post.body : ''
-    } else console.log("YOU FUCKED UP THE MKBODY CALL")
+    } else console.error("YOU FUCKED UP THE MKBODY CALL")
 
     return outputBody;
 }
@@ -102,19 +103,34 @@
  */
 function getSubmissionImages(post) {
     // is_gallery is when there is >1 image (shocker)
-    if (post.is_gallery) {
+    // deleted gallery posts have their gallery_data and media_metadata erased too
+    if (post.is_gallery && post.gallery_data != null && post.media_metadata != null) {
         var gallery_collection = [];
         // iterate through JSON keys of gallery_data.
         // we need to get the keys/IDs from gallery_data since its items are in the proper order that the gallery is,
         // media_metadata sorts the images by alphanumerical order, and we want to preserve the proper order of the gallery
         // still, we use the keys we get from gallery_data to get the links from media_metadata
-        post.gallery_data.items.forEach(image => {
+        post.gallery_data.items.forEach(fileIDs => {
             // index media_metadata using the "media_id" of the current item we have iterated over in gallery_data
             // in the item we have indexed:
             // s = data for the best quality version of the image
             // s.u = the url of it
             // so we get s.u of the current key and push it to the array
-            gallery_collection.push(decodeHTML(post.media_metadata[image.media_id].s.u));
+            // if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
+            var file = post.media_metadata[fileIDs.media_id];
+
+            if (file.status == "valid") {
+                if (file.e == "Image") {
+                    gallery_collection.push(decodeHTML(file.s.u))
+                } else if (file.e == "AnimatedImage") {
+                    gallery_collection.push(decodeHTML(file.s.gif))
+                } else {
+                    console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
+                }
+
+            } /*else if (file.status == "failed") {
+
+            }*/
         });
         return gallery_collection;
     } else {
@@ -209,6 +225,7 @@ function writeSql(post, targetCommName, targetUserName) {
     // post_aggregates is better to change anyway since its an INT8 and not an INT2, which means the score doesnt cap out at 2¹⁵-1, perhaps needing multiple entries for super popular posts
     // we use an UPDATE statement since there is already a row that gets made when we inserted the post prior
     // note: hovering over the upvote count will show the sad truth :(
+
     query += `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`
 
     // Traverse again but INSERT this time (this could probably be a function)
@@ -220,7 +237,6 @@ function writeSql(post, targetCommName, targetUserName) {
         const current = stack[index]; // Get the current item to process
         index++; // Move the index forward
 
-        console.log("bye")
         // Insert the current comment
         query += `INSERT INTO comment (creator_id, post_id, content, published, updated) ` +
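
For reference, the gallery handling this patch adds to getSubmissionImages() condenses to the standalone sketch below. It is only an illustration: getGalleryUrls and the pass-through decodeHTML default are hypothetical names, and the media_metadata field shapes (status, e, s.u, s.gif) are assumed to be exactly as the patch treats them.

// Sketch of the patched gallery extraction, under the assumptions above:
// gallery_data.items preserves the gallery's display order, media_metadata
// is keyed by media_id, and each entry reports a status ("valid"/"failed")
// and a type tag e ("Image"/"AnimatedImage").
function getGalleryUrls(post, decodeHTML = (s) => s) {
    // deleted gallery posts have gallery_data and media_metadata erased
    if (!post.is_gallery || post.gallery_data == null || post.media_metadata == null) {
        return [];
    }
    const urls = [];
    for (const { media_id } of post.gallery_data.items) {
        const file = post.media_metadata[media_id];
        if (file.status !== "valid") continue; // skip "failed" uploads
        if (file.e === "Image") {
            urls.push(decodeHTML(file.s.u));   // s.u = url of the best-quality image
        } else if (file.e === "AnimatedImage") {
            urls.push(decodeHTML(file.s.gif)); // animated files expose s.gif instead
        } else {
            console.error(`unknown filetype for ${media_id}: ${file.e}`);
        }
    }
    return urls;
}

Called on a deleted gallery post (gallery_data: null) or a gallery containing a failed upload, this returns the surviving URLs instead of throwing, which is the failure mode the commit message describes.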