fix sql building for gallery posts that are deleted and galleries that have images that are status: failed
commit c76e3e7cbb
parent 7687c66092

src/index.js | 18
@@ -63,28 +63,36 @@ async function unflatten(postsFile, commentsFile) {
 //console.log('Thread Structure:');
 //printThreadStructure(result);

-const subredditName = result[0].subreddit_name_prefixed.slice(2);
-const resultOutput = `${subredditName}-threads.json`;
+const resultOutput = `${args.comm}-threads.json`;
+const subredditName = result[0].subreddit
+const sqlOutputPath = args.output?.trim() ? args.output : `${subredditName}.sql`;

 // Write the result to a file
 writeFileSync(resultOutput, JSON.stringify(result, null, 2));

 // console.log(result[357])
 result = {};

 // read the threads through a stream
 const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());

 // empty the sql file if it exists
-existsSync(args.output) ? writeFileSync(args.output, '') : null
-// write the sql to the user specified path, or use [subredditname].sql if none is chosen
-const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
+existsSync(sqlOutputPath) ? writeFileSync(sqlOutputPath, '') : null
+// write the sql to the path
+const sqlOutput = createWriteStream(sqlOutputPath, {flags: "a"});

 var threadCounter = 0;

 // create an sql query to make a lemmy thread for each json object
 pipeline.on('data', (thread) => {
     try{
         sqlOutput.write(writeSql(thread.value, args.comm, args.user));
         threadCounter++;
+        //console.log(threadCounter)
+        //threadCounter == 467 ? console.log( `${threadCounter} ${thread}`) : null
     } catch (error) {
         console.error('Error processing post:', error);
     }
 });

 // close the stream and say how many threads are processed and where its saved
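Note on this hunk: the fix computes the SQL output path once, up front, so the truncation step and the write stream can no longer disagree about where the file lives (previously `existsSync(args.output)` checked the raw flag while `createWriteStream` re-derived the fallback). A minimal standalone sketch of the resolution logic; `resolveSqlOutputPath` and the sample `args` object are hypothetical, not code from this repo:

    import { existsSync, writeFileSync, createWriteStream } from 'fs';

    // hypothetical helper mirroring the new sqlOutputPath expression
    function resolveSqlOutputPath(args, subredditName) {
        // prefer the user-supplied path; otherwise fall back to <subreddit>.sql
        return args.output?.trim() ? args.output : `${subredditName}.sql`;
    }

    const sqlOutputPath = resolveSqlOutputPath({ comm: 'pics' }, 'pics'); // sample args
    if (existsSync(sqlOutputPath)) writeFileSync(sqlOutputPath, ''); // truncate stale output
    const sqlOutput = createWriteStream(sqlOutputPath, { flags: 'a' }); // append from here on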
@@ -59,8 +59,8 @@ async function processPostsAndComments(postsFile, commentsFile) {

 // allowed fields
 // reduced the size of my dummy json to about 31% of its size without the filters
-const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
-const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
+const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
+const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];

 // Process posts first
 const postsStream = createInterface({
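`filterJsonKeys` itself is not part of this diff; under the assumption that it is a plain whitelist filter, a sketch consistent with how it is called in the next hunk (the real implementation may differ):

    // assumed implementation of filterJsonKeys
    function filterJsonKeys(obj, allowedKeys) {
        // keep only whitelisted keys, dropping the bulk of each dump entry
        return Object.fromEntries(
            Object.entries(obj).filter(([key]) => allowedKeys.includes(key))
        );
    }

    const slim = filterJsonKeys({ author: 'a', score: 1, thumbnail: 'x' }, ['author', 'score']);
    // { author: 'a', score: 1 }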
@@ -68,10 +68,13 @@ async function processPostsAndComments(postsFile, commentsFile) {
     crlfDelay: Infinity
 });

+var dbgpost = 0;
 for await (const line of postsStream) {
     if (line.trim()) {
         const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
         context.processItem(post);
+        //dbgpost==467?console.log(dbgpost + line):null;
+        dbgpost++;
     }
 }
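The loop above is the standard readline pattern for line-delimited JSON dumps; a self-contained version for reference (the file name is illustrative):

    import { createReadStream } from 'fs';
    import { createInterface } from 'readline';

    const postsStream = createInterface({
        input: createReadStream('pics_submissions.ndjson'), // illustrative path
        crlfDelay: Infinity, // treat \r\n as one line break
    });

    for await (const line of postsStream) {
        if (!line.trim()) continue; // skip blank lines
        const post = JSON.parse(line); // one submission object per line
        // filter keys here, then hand the post to the thread builder
    }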
src/sql.js | 28
@@ -74,7 +74,8 @@ function mkBody(post, postType) {
 outputBody += post.author_flair_text && post.author_flair_text.trim() ? ` - ${post.author_flair_text}` : ''

 // add original subreddit
-outputBody += ` - originally from /${post.subreddit_name_prefixed}\``;
+// changed from post.subreddit_name_prefixed to post.subreddit since the former isnt in all posts
+outputBody += ` - originally from /r/${post.subreddit}\``;

 // then add the post body if it has one.
 // comments use a different field for the body (.body and not .selftext)
@@ -91,7 +92,7 @@ function mkBody(post, postType) {
     outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
 } else if (postType == "comment") {
     outputBody += post.body && post.body.trim() ? `\n\n` + post.body : ''
-} else console.log("YOU FUCKED UP THE MKBODY CALL")
+} else console.error("YOU FUCKED UP THE MKBODY CALL")

 return outputBody;
 }
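The practical effect of the subreddit change: `subreddit_name_prefixed` ("r/pics") is missing from some dump entries, while `subreddit` ("pics") is bare but reliably present, hence the hand-written `/r/` prefix. A sketch of just the attribution fragment (the surrounding attribution text is assumed, not quoted from the real output):

    // simplified footer fragment from mkBody's attribution line
    function subredditAttribution(post) {
        // post.subreddit is bare ("pics"), so the /r/ prefix is added by hand
        return ` - originally from /r/${post.subreddit}\``;
    }

    console.log(subredditAttribution({ subreddit: 'pics' }));
    // " - originally from /r/pics`"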
@@ -102,19 +103,34 @@ function mkBody(post, postType) {
  */
 function getSubmissionImages(post) {
     // is_gallery is when there is >1 image (shocker)
-    if (post.is_gallery) {
+    // deleted gallery posts have their gallery_data and media_metadata erased too
+    if (post.is_gallery && post.gallery_data != null && post.media_metadata != null) {
         var gallery_collection = [];
         // iterate through JSON keys of gallery_data.
         // we need to get the keys/IDs from gallery_data since its items are in the proper order that the gallery is,
         // media_metadata sorts the images by alphanumerical order, and we want to preserve the proper order of the gallery
         // still, we use the keys we get from gallery_data to get the links from media_metadata
-        post.gallery_data.items.forEach(image => {
+        post.gallery_data.items.forEach(fileIDs => {
             // index media_metadata using the "media_id" of the current item we have iterated over in gallery_data
             // in the item we have indexed:
             // s = data for the best quality version of the image
             // s.u = the url of it
-            // so we get s.u of the current key and push it to the array
-            gallery_collection.push(decodeHTML(post.media_metadata[image.media_id].s.u));
+            // if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
+            var file = post.media_metadata[fileIDs.media_id];
+
+            if (file.status == "valid") {
+                if (file.e == "Image"){
+                    gallery_collection.push(decodeHTML(file.s.u))
+                } else if (file.e == "AnimatedImage") {
+                    gallery_collection.push(decodeHTML(file.s.gif))
+                } else {
+                    console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
+                }
+
+            } /*else if (file.status == "failed") {
+
+            }*/
         });
         return gallery_collection;
     } else {
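Taken together, the new guards make the gallery path tolerant of two kinds of damage named in the commit message: deleted posts (whose gallery_data and media_metadata are wiped) and individual uploads with status "failed". A condensed standalone sketch of the resulting behaviour; decodeHTML is assumed to be an HTML-entity decoder such as the html-entities package's decode, and the non-gallery branch is elided:

    import { decode as decodeHTML } from 'html-entities'; // assumed decoder

    function galleryImageUrls(post) {
        // deleted gallery posts have gallery_data and media_metadata erased
        if (!post.is_gallery || post.gallery_data == null || post.media_metadata == null) return [];

        const urls = [];
        for (const item of post.gallery_data.items) { // gallery_data preserves display order
            const file = post.media_metadata[item.media_id];
            if (file.status !== 'valid') continue; // "failed" uploads have no usable links
            if (file.e === 'Image') urls.push(decodeHTML(file.s.u));
            else if (file.e === 'AnimatedImage') urls.push(decodeHTML(file.s.gif));
            else console.error(`unknown filetype ${file.e} in post ${post.name}`);
        }
        return urls;
    }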
@@ -209,6 +225,7 @@ function writeSql(post, targetCommName, targetUserName) {
 // post_aggregates is better to change anyway since its an INT8 and not an INT2, which means the score doesnt cap out at 2¹⁵-1, perhaps needing multiple entries for super popular posts
 // we use an UPDATE statement since there is already a row that gets made when we inserted the post prior
+// note: hovering over the upvote count will show the sad truth :(

 query += `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`

 // Traverse again but INSERT this time (this could probably be a function)
@@ -220,7 +237,6 @@ function writeSql(post, targetCommName, targetUserName) {
 const current = stack[index]; // Get the current item to process
 index++; // Move the index forward

-console.log("bye")
 // Insert the current comment
 query +=
     `INSERT INTO comment (creator_id, post_id, content, published, updated) ` +
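For concreteness, the per-post score statement that writeSql appends looks like the sketch below; root_post_id is resolved by the surrounding SQL that writeSql generates, not by this snippet:

    // minimal sketch of the score-update string building
    function scoreUpdateSql(post) {
        // post_aggregates.score is an INT8, so large Reddit scores fit without capping at 2^15-1
        return `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`;
    }

    process.stdout.write(scoreUpdateSql({ score: 1234 }));
    // UPDATE post_aggregates SET score=1234 WHERE post_id=root_post_id;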