fix annoying edge case shit and reduce json file size a bit
parent 3925c2f8ed
commit 7209e75c6b
.gitignore (vendored): 4 changes
@@ -1,6 +1,6 @@
 dummy/
 node_modules/
-*.sql
+*.sql*
 ok
 src/test.js
-*-threads.json
+*-threads.json*
@@ -1,6 +1,8 @@
 # reddit-lemmy-importer
 turn json files downloaded from https://the-eye.eu/redarcs/ into lemmy comms :D
+
+*Note: these archives only cover 12/31/2022 and earlier*
 
 this is effectively https://github.com/mesmere/RedditLemmyImporter but in js and for a different type of archive
 
 the posts/comments dump is read as a stream so handling bigger subreddits is less ram-intensive (though the final tree will still take up a good amount of ram so maybe create a big swapfile if processing large subreddits)
@@ -9,6 +11,8 @@ the posts/comments dump is read as a stream so handling bigger subreddits is less
 
 You can build the SQL script before making the comm/user though.
 
+
+
 ## usage:
 install dependencies
 
@@ -33,7 +37,7 @@ option 3: import your sql to a docker container
 ## TODO:
 - set URL embed titles/descriptions and url_content type and embed_video_url in posts
 
-- FIX ap_id!!!!!
+- FIX ap_id!!!!! (may not be needed, see the lemmy/src/code_migrations.rs functions post_updates_2020_04_03 and comment_updates_2020_04_03)
 
   - this could be done by taking the federated url as an argument then updating the ap_id using [the url + /type/ + sql id from the post]
 
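The README's point about reading the dump as a stream is the main memory trick in this repo; as a minimal sketch of the idea only (the filename is made up, the dump is assumed to be newline-delimited JSON, and the project itself may use a different streaming parser):

// sketch only: read a pushshift-style dump line by line instead of parsing the whole file at once
import fs from 'node:fs';
import readline from 'node:readline';

async function* readDump(path) {
    const rl = readline.createInterface({ input: fs.createReadStream(path), crlfDelay: Infinity });
    for await (const line of rl) {
        if (line.trim()) yield JSON.parse(line); // one submission/comment object per line
    }
}

for await (const post of readDump('subreddit_submissions.ndjson')) {
    // handle one post at a time; memory stays roughly flat until the comment tree is built
}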
@@ -91,7 +91,7 @@ async function unflatten(postsFile, commentsFile) {
             //console.log(threadCounter)
             //threadCounter == 467 ? console.log( `${threadCounter} ${thread}`) : null
         } catch (error) {
-            console.error('Error processing post:', error);
+            console.error(`Error processing post ${thread.value.name}:`, error);
         }
     });
 
@@ -58,8 +58,9 @@ async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
 
     // allowed fields
-    // reduced the size of my dummy json to about 31% of its size without the filters
-    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
+    // reduced the size of my dummy json to about 22% of its original size without the filters
+    // also makes writing sql a couple seconds faster since it's reading less bullshit from the disk
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "crosspost_parent_list", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
     const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];
 
     // Process posts first
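filterJsonKeys itself is not part of this diff; a plausible minimal version of that kind of key whitelist (purely illustrative, the repo's real helper may differ) is just:

// illustrative only: return a copy of obj containing only the allowed keys
function filterJsonKeys(obj, allowedKeys) {
    return Object.fromEntries(Object.entries(obj).filter(([key]) => allowedKeys.includes(key)));
}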
@@ -78,6 +79,19 @@ async function processPostsAndComments(postsFile, commentsFile) {
         }
 
         post = filterJsonKeys(post, submissionKeysAllowed);
+
+        // clear unused fields + large array of unused thumbnails (best quality is used instead)
+        if(post?.gallery_data?.items) { post.gallery_data.items.forEach(item => {
+            item?.id ? delete item.id : null;
+            if(post?.media_metadata?.[item?.media_id]) {
+                delete post.media_metadata[item.media_id].m;
+                delete post.media_metadata[item.media_id].p;
+            }
+        })}
+
+        // reduce crosspost size too
+        post.crosspost_parent_list ? post.crosspost_parent_list.forEach((crosspost, i) => post.crosspost_parent_list[i] = filterJsonKeys(crosspost, submissionKeysAllowed)) : null;
+
         context.processItem(post);
         //dbgpost++;
     }
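For context on what the cleanup above strips: a gallery submission in the dump carries roughly this shape (field values invented for illustration). `p` is an array of preview thumbnails and `m` the mime type; once the full-size `s` entry is kept, neither is needed, and the numeric `id` inside gallery_data isn't used either.

// rough shape of a gallery post in the dump (values are made up)
const examplePost = {
    is_gallery: true,
    gallery_data: { items: [{ media_id: "abc123", id: 1234567, caption: "optional caption" }] },
    media_metadata: {
        abc123: {
            status: "valid",
            e: "Image",                      // or "AnimatedImage"
            m: "image/jpg",                  // deleted by the cleanup above
            p: [ /* many preview sizes */ ], // deleted by the cleanup above
            s: { u: "https://preview.redd.it/abc123.jpg", x: 1920, y: 1080 }
        }
    }
};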
src/sql.js: 60 changes
@@ -55,8 +55,31 @@ function mkTitle(post) {
 
 // wrap the url in singlequotes HERE and not in the query like '${lit(mkUrl(post))}'
 // this is because the null type in postgres will be turned into a string which will break lemmy until you remove the row or set it to null manually
+
+// 1 line ternary operator gets obliterated thanks to 15,000,000 different edge cases
 function mkUrl(post) {
-    return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : `'${post.url}'`
+    if(post.is_gallery){
+        // deleted galleries are "url": null and not 'null' in text
+        // [0][0] means the first image's url
+        return post.url == null ? 'null' : `'${getSubmissionImages(post)[0][0]}'`;
+    } else if(post.is_self) {
+        return 'null'
+    } else if(post.crosspost_parent_list) {
+        // crosspost urls are just paths "/r/subreddit/id/...", so get the full url from the original post
+        // deleted crossposts have an empty array
+        if(post.crosspost_parent_list.length > 0) {
+            if(post.crosspost_parent_list[0].is_gallery){
+                return `'${getSubmissionImages(post.crosspost_parent_list[0])[0][0]}'`;
+            } else {
+                return `'${post.crosspost_parent_list[0].url}'`
+            }
+        } else {
+            return 'null'
+        }
+    } else {
+        return post.url == null || post.url == '' ? 'null' : `'${post.url}'`;
+    }
+    //return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : post.crosspost_parent_list ? `'${post.crosspost_parent_list[0].url}'` : `'${post.url}'`
 }
 
 /**
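Since mkUrl is now exported (see the export change at the end of this file), a throwaway script can sanity-check the edge cases, assuming dependencies are installed and you run it from the repo root; the post objects below are minimal made-up examples, not rows from a real dump:

// each call returns either the bare string 'null' or a single-quoted SQL literal
import { mkUrl } from './src/sql.js';

console.log(mkUrl({ is_self: true }));                     // null  (self post, url column stays NULL)
console.log(mkUrl({ url: 'https://example.com/a.png' }));  // 'https://example.com/a.png'
console.log(mkUrl({ is_gallery: true, url: null }));       // null  (deleted gallery)
console.log(mkUrl({ crosspost_parent_list: [] }));         // null  (deleted crosspost)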
@@ -86,9 +109,24 @@ function mkBody(post, postType) {
     // double newlines for reddit spacing is done at the START of the next addition, this way there arent trailing newlines if theres nothing else after.
     outputBody += [...post.title].length > 200 ? `\n\n\`Full title: ${post.title}\`` : ''
 
+    // i want to scream
+    if (post.crosspost_parent_list && post.crosspost_parent_list.length > 0) {
+        var crosspost = post.crosspost_parent_list[0];
+        outputBody += `\n\nCrosspost:`
+
+        outputBody += `\n\n${mkBody(crosspost, "post")}`
+        return outputBody;
+    }
+
     var postImages = getSubmissionImages(post);
     // add "Gallery links:" then all the image links as bullet points if the post is a gallery
-    outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => `\n\n- ${image}`).join('')}` : ''
+    outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => {
+        if(image.length > 1) {
+            // >1 means the image has a caption
+            return `\n\n- ${image[0]} - "${image[1]}"`
+        }
+        return `\n\n- ${image[0]}`
+    }).join('')}` : ''
 
     // only if selftext exists, it wont exist if its an image post
     outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
@@ -120,16 +158,27 @@ function getSubmissionImages(post) {
             // so we get s.u of the current key and push it to the array
             // if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
             var file = post.media_metadata[fileIDs.media_id];
+            var item = [];
 
             if (file.status == "valid") {
                 if (file.e == "Image"){
-                    gallery_collection.push(decodeHTML(file.s.u))
+                    item.push(decodeHTML(file.s.u))
                 } else if (file.e == "AnimatedImage") {
-                    gallery_collection.push(decodeHTML(file.s.gif))
+                    item.push(decodeHTML(file.s.gif))
                 } else {
                     console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
                 }
+
+                // if there was a known image type
+                if (item.length > 0) {
+                    // add caption too if it exists
+                    if(fileIDs.caption) {
+                        item.push(fileIDs.caption)
+                    }
+
+                    gallery_collection.push(item);
+                }
+
             } /*else if (file.status == "failed") {
 
             }*/
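With this change getSubmissionImages collects one small array per gallery item instead of a bare URL string, which is why mkUrl above indexes [0][0] and mkBody checks image.length; the return value looks roughly like this (URLs made up):

// sketch of the new return shape: [url] or [url, caption] per valid gallery item
const exampleReturn = [
    ["https://i.redd.it/abc123.jpg", "optional caption"],
    ["https://i.redd.it/def456.gif"]
];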
@@ -141,7 +190,6 @@ function getSubmissionImages(post) {
         }
     }
 
-
 // Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
 // where comment and reply are ids in the comment table
 
@@ -267,4 +315,4 @@ function writeSql(post, targetCommName, targetUserName) {
     return `DO $$ ${query} $$;\n\n`
 }
 
-export { writeSql }
+export { writeSql, mkUrl }