Compare commits: 795edd7141...main (7 commits)

Commits: 7209e75c6b, 3925c2f8ed, c76e3e7cbb, 7687c66092, ead0c22817, 5a2e268f3c, 7b6b69141c
.gitignore (vendored, 4 changes)
@@ -1,6 +1,6 @@
dummy/
node_modules/
*.sql
*.sql*
ok
src/test.js
processed-threads.json
*-threads.json*
README.md (31 changes)
@@ -1,6 +1,8 @@
# reddit-lemmy-importer
turn json files downloaded from https://the-eye.eu/redarcs/ into lemmy comms :D

*Note: these archives are only 12/31/2022 and before*

this is effectively https://github.com/mesmere/RedditLemmyImporter but in js and for a different type of archive

the posts/comments dump is read as a stream so handling bigger subreddits is less ram-intensive (though the final tree will still take up a good amount of ram so maybe create a big swapfile if processing large subreddits)
@@ -9,10 +11,33 @@ the posts/comments dump is read as a stream so handling bigger subreddits is les

You can build the SQL script before making the comm/user though.

## usage:
install dependencies

`yarn install`

run the program

`yarn run importer --posts example-submissions.json --comments example-comments.json -c example_archive -u archive_bot -o example.sql`

option 1: import your sql remotely

`psql --username=lemmy --dbname=lemmy --port=[lemmy-port] --host=[ip/host] --file=example.sql`

option 2: import your sql on the server

`psql --username=lemmy --dbname=lemmy --file=example.sql`

option 3: import your sql to a docker container

`<example.sql docker exec -i [container-id] psql --username=lemmy --dbname=lemmy`

## TODO:
- set URL embed titles/descriptions and url_content type and embed_video_url in posts

- FIX ap_id!!!!!
- FIX ap_id!!!!! (may not be needed, see lemmy/src/code_migrations.rs function post_updates_2020_04_03 and comment_updates_2020_04_03)

- - this could be done by taking the federated url as an argument then updating the ap_id using [the url + /type/ + sql id from the post]

@@ -22,9 +47,9 @@ You can build the SQL script before making the comm/user though.

- - since right now it just changes the upvotes to be negative or whatever the score is

- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space. ✔

- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing ✔

## references
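A note on the ap_id TODO above: the "[url + /type/ + sql id]" idea could be emitted as two extra UPDATE statements at the end of the generated script, roughly the shape of the lemmy code_migrations the TODO references. A minimal, hypothetical sketch; the function name and the idea of passing the federated base URL as an extra argument are illustrative, not part of the current code:

```js
// hypothetical sketch: rewrite ap_id from a federated base URL, the object type and the row id,
// e.g. https://lemmy.example/post/123 and https://lemmy.example/comment/456
function mkApIdFix(baseUrl) {
    return `UPDATE post SET ap_id = '${baseUrl}' || '/post/' || id;\n` +
           `UPDATE comment SET ap_id = '${baseUrl}' || '/comment/' || id;\n`;
}
```

In practice the UPDATEs would need to be scoped to the freshly imported rows rather than the whole table.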
package.json
@@ -10,6 +10,7 @@
    "chalk": "^5.3.0",
    "he": "^1.2.0",
    "moment": "^2.30.1",
    "stream-json": "^1.9.1",
    "yargs": "^17.7.2"
  },
  "scripts": {
src/index.js (63 changes)
@@ -2,20 +2,22 @@ import { processPostsAndComments } from './parser.js';
import { writeSql } from './sql.js';

import { join } from "node:path";
// es6 >:(

// es6 path >:(
import path from 'path';
import {
    fileURLToPath
} from 'url';

import { exists, existsSync, writeFileSync, appendFileSync } from 'node:fs';

export const __filename = fileURLToPath(
    import.meta.url);
export const __dirname = path.dirname(__filename);

import { existsSync, writeFileSync, createReadStream, createWriteStream } from 'node:fs';

import yargs from 'yargs';

import streamArray from 'stream-json/streamers/StreamArray.js';

// https://github.com/yargs/yargs/blob/main/docs/examples.md section "Yargs is here to help you..."
var args = yargs(process.argv.slice(2))
    .alias('c', 'comm')
@@ -30,7 +32,7 @@ var args = yargs(process.argv.slice(2))
    })
    .string(['comm', 'user', 'output', 'posts', 'comments'])
    .nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
    .demandOption(['comm', 'user', 'output', 'posts', 'comments'])
    .demandOption(['comm', 'user', 'posts', 'comments'])
    .help('h')
    .alias('h', 'help')
    .epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -57,18 +59,52 @@ function printThreadStructure(thread, level = 0) {

async function unflatten(postsFile, commentsFile) {
    try {
        const result = await processPostsAndComments(postsFile, commentsFile);
        var result = await processPostsAndComments(postsFile, commentsFile);
        //console.log('Thread Structure:');
        //printThreadStructure(result);

        // Optional: write the result to a file
        //writeFileSync('processed-threads.json', JSON.stringify(result, null, 2));
        const resultOutput = `${args.comm}-threads.json`;
        const subredditName = result[0].subreddit
        const sqlOutputPath = args.output?.trim() ? args.output : `${subredditName}.sql`;

        // empty the file if it exists
        existsSync(args.output) ? writeFileSync(args.output, '') : null
        result.forEach(post => {
        // Write the result to a file
        writeFileSync(resultOutput, JSON.stringify(result, null, 2));

        // console.log(result[357])
        result = {};

        // read the threads through a stream
        const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());

        // empty the sql file if it exists
        existsSync(sqlOutputPath) ? writeFileSync(sqlOutputPath, '') : null
        // write the sql to the path
        const sqlOutput = createWriteStream(sqlOutputPath, {flags: "a"});

        var threadCounter = 0;

        // create an sql query to make a lemmy thread for each json object
        pipeline.on('data', (thread) => {
            try{
                sqlOutput.write(writeSql(thread.value, args.comm, args.user));
                threadCounter++;
                //console.log(threadCounter)
                //threadCounter == 467 ? console.log( `${threadCounter} ${thread}`) : null
            } catch (error) {
                console.error(`Error processing post ${thread.value.name}:`, error);
            }
        });

        // close the stream and say how many threads are processed and where its saved
        pipeline.on('end', () => {
            sqlOutput.close();
            console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
        });

        // old
        /* result.forEach(post => {
            appendFileSync(args.output, writeSql(post, args.comm, args.user))
        })
        }) */
    } catch (error) {
        console.error('Error processing files:', error);
    }
@@ -77,5 +113,4 @@ async function unflatten(postsFile, commentsFile) {
// Run the main function
unflatten(args.posts, args.comments);

//console.log("HOLY FUCKING SMOKES!" + existsSync(tree))
const outputPath = join(__dirname, '/', args.output);
//const outputPath = join(__dirname, '/', args.output);
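For context on the new index.js flow (dump the processed tree to `${comm}-threads.json`, then re-read it as a stream): stream-json's StreamArray parser emits one `{key, value}` pair per element of a top-level JSON array, which is what lets the SQL pass run without re-parsing the whole result in memory. A minimal standalone sketch of that pattern, with the file name and the logging handler as stand-in examples:

```js
import { createReadStream } from 'node:fs';
import StreamArray from 'stream-json/streamers/StreamArray.js';

// stream a top-level JSON array one element at a time instead of JSON.parse-ing the whole file
const pipeline = createReadStream('threads.json').pipe(StreamArray.withParser());

pipeline.on('data', ({ key, value }) => {
    // key is the array index, value is the parsed element (one post/comment tree here)
    console.log(`thread ${key}: ${value.title}`);
});

pipeline.on('end', () => console.log('done'));
```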
src/parser.js
@@ -41,8 +41,27 @@ class ProcessingContext {
    }
}

// remove all fields from jsonObj that arent in the allowedKeys array
function filterJsonKeys(jsonObj, allowedKeys) {
    // Input validation
    if (typeof jsonObj !== 'object' || jsonObj === null) {
        throw new TypeError('Input must be an object');
    }

    return Object.fromEntries(
        Object.entries(jsonObj)
            .filter(([key]) => allowedKeys.includes(key))
    );
}

async function processPostsAndComments(postsFile, commentsFile) {
    const context = new ProcessingContext();

    // allowed fields
    // reduced the size of my dummy json to about 22% of its original size without the filters
    // also makes writing sql a couple seconds faster since its reading less bullshit from the disk
    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "crosspost_parent_list", "edited", "gallery_data", "is_gallery", "is_self", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit", "title", "url"];
    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit"];

    // Process posts first
    const postsStream = createInterface({
@@ -50,10 +69,31 @@ async function processPostsAndComments(postsFile, commentsFile) {
        crlfDelay: Infinity
    });

    //var dbgpost = 0;
    for await (const line of postsStream) {
        if (line.trim()) {
            const post = JSON.parse(line);
            // i think this is only a problem for the comments (see below) but i did it here too as a safety measure
            var post = JSON.parse(line)
            if(!post.name){
                post.name = `t3_${post.id}`;
            }

            post = filterJsonKeys(post, submissionKeysAllowed);

            // clear unused fields + large array of unused thumbnails (best quality is used instead)
            if(post?.gallery_data?.items) { post.gallery_data.items.forEach(item => {
                item?.id ? delete item.id : null;
                if(post?.media_metadata?.[item?.media_id]) {
                    delete post.media_metadata[item.media_id].m;
                    delete post.media_metadata[item.media_id].p;
                }
            })}

            // reduce crosspost size too
            post.crosspost_parent_list ? post.crosspost_parent_list.forEach(crosspost => crosspost = filterJsonKeys(post, submissionKeysAllowed)) : null;

            context.processItem(post);
            //dbgpost++;
        }
    }

@@ -65,7 +105,14 @@ async function processPostsAndComments(postsFile, commentsFile) {

    for await (const line of commentsStream) {
        if (line.trim()) {
            const comment = JSON.parse(line);
            // dont filter yet so that we can have the id key
            var comment = JSON.parse(line)
            // if its a comment with no "name" then make a "name" field
            if(!comment.name){
                comment.name = `t1_${comment.id}`;
            }

            comment = filterJsonKeys(comment, commentKeysAllowed);
            context.processItem(comment);
        }
    }
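To illustrate what the key filtering does to each line of the archive dump, here is a tiny standalone example; the sample object and field values are made up:

```js
// copy of the helper from the diff above, condensed for a standalone run
const filterJsonKeys = (obj, allowed) =>
    Object.fromEntries(Object.entries(obj).filter(([key]) => allowed.includes(key)));

// made-up submission fragment; only whitelisted keys survive the filter
const raw = { author: 'someone', title: 'hello', score: 42, thumbnail: 'self', over_18: false };

// -> { author: 'someone', title: 'hello', score: 42 }
console.log(filterJsonKeys(raw, ['author', 'title', 'score']));
```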
src/sql.js (92 changes)
@@ -1,5 +1,7 @@
// shamelessly stolen code from https://github.com/mesmere/RedditLemmyImporter/blob/main/src/main/kotlin/write.kt
// also reading the lemmy schema in lemmy/crates/db_schema/src/schema.rs
// reads the created tree of a post and its comments and builds a json query to add it to your lemmy comm

import moment from 'moment';
import he from 'he';

@@ -15,8 +17,10 @@ function lit(str) {
} */

// decodeHTML then replace all instances of ' with ''
// then escape $ as \$, since saying "$$$" (like money) will close the "DO $$" statement :(

function lit(str) {
    return typeof str === 'string' ? decodeHTML(str).replace(/'/g, "''") : 'null'
    return typeof str === 'string' ? decodeHTML(str).replace(/'/g, "''").replace(/\$/g, "\\$") : 'null'
}

// Decode HTML entities (e.g., '&gt;' -> '>')
@@ -50,9 +54,32 @@ function mkTitle(post) {
}

// wrap the url in singlequotes HERE and not in the query like '${lit(mkUrl(post))}'
// this is because the null type in postgres will be turned into a string which will break lemmy until you remove the row or set it to null manually

// 1 line ternary operator gets obliterated thanks to 15,000,000 different edge cases
function mkUrl(post) {
    return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : `'${post.url}'`
    if(post.is_gallery){
        // deleted galleries are "url": null and not 'null' in text
        // [0][0] means the first images url
        return post.url == null ? 'null' : `'${getSubmissionImages(post)[0][0]}'`;
    } else if(post.is_self) {
        return 'null'
    } else if(post.crosspost_parent_list) {
        // crosspost urls are just paths "/r/subreddit/id/...", so get the full url from the original post
        // deleted crossposts have an empty array
        if(post.crosspost_parent_list.length > 0) {
            if(post.is_gallery){
                return `'${getSubmissionImages(post.crosspost_parent_list[0])[0]}'`;
            } else {
                return `'${post.crosspost_parent_list[0].url}'`
            }
        } else {
            return 'null'
        }
    } else {
        return post.url == null || post.url == '' ? 'null' : `'${post.url}'`;
    }
    //return post.is_gallery ? `'${getSubmissionImages(post)[0]}'` : post.is_self ? 'null' : post.crosspost_parent_list ? `'${post.crosspost_parent_list[0].url}'` : `'${post.url}'`
}

/**
@@ -72,7 +99,8 @@ function mkBody(post, postType) {
    outputBody += post.author_flair_text && post.author_flair_text.trim() ? ` - ${post.author_flair_text}` : ''

    // add original subreddit
    outputBody += ` - originally from /${post.subreddit_name_prefixed}\``;
    // changed from post.subreddit_name_prefixed to post.subreddit since the former isnt in all posts
    outputBody += ` - originally from /r/${post.subreddit}\``;

    // then add the post body if it has one.
    // comments use a different field for the body (.body and not .selftext)
@@ -81,15 +109,30 @@ function mkBody(post, postType) {
        // double newlines for reddit spacing is done at the START of the next addition, this way there arent trailing newlines if theres nothing else after.
        outputBody += [...post.title].length > 200 ? `\n\n\`Full title: ${post.title}\`` : ''

        // i want to scream
        if (post.crosspost_parent_list && post.crosspost_parent_list.length > 0) {
            var crosspost = post.crosspost_parent_list[0];
            outputBody += `\n\nCrosspost:`

            outputBody += `\n\n${mkBody(crosspost, "post")}`
            return outputBody;
        }

        var postImages = getSubmissionImages(post);
        // add "Gallery links:" then all the image links as bullet points if the post is a gallery
        outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => `\n\n- ${image}`).join('')}` : ''
        outputBody += postImages.length > 1 ? `\n\nGallery links:\n- ${post.url}${postImages.map(image => {
            if(image.length > 1) {
                // >1 means the image has a caption
                return `\n\n- ${image[0]} - "${image[1]}"`
            }
            return `\n\n- ${image[0]}`
        }).join('')}` : ''

        // only if selftext exists, it wont exist if its an image post
        outputBody += post.selftext && post.selftext.trim() ? `\n\n` + post.selftext : ''
    } else if (postType == "comment") {
        outputBody += post.body && post.body.trim() ? `\n\n` + post.body : ''
    } else console.log("YOU FUCKED UP THE MKBODY CALL")
    } else console.error("YOU FUCKED UP THE MKBODY CALL")

    return outputBody;
}
@@ -100,19 +143,45 @@ function mkBody(post, postType) {
*/
function getSubmissionImages(post) {
    // is_gallery is when there is >1 image (shocker)
    if (post.is_gallery) {
    // deleted gallery posts have their gallery_data and media_metadata erased too
    if (post.is_gallery && post.gallery_data != null && post.media_metadata != null) {
        var gallery_collection = [];
        // iterate through JSON keys of gallery_data.
        // we need to get the keys/IDs from gallery_data since its items are in the proper order that the gallery is,
        // media_metadata sorts the images by alphanumerical order, and we want to preserve the proper order of the gallery
        // still, we use the keys we get from gallery_data to get the links from media_metadata
        post.gallery_data.items.forEach(image => {
        post.gallery_data.items.forEach(fileIDs => {
            // index media_metadata using the "media_id" of the current item we have iterated over in gallery_data
            // in the item we have indexed:
            // s = data for the best quality version of the image
            // s.u = the url of it
            // so we get s.u of the current key and push it to the array
            gallery_collection.push(decodeHTML(post.media_metadata[image.media_id].s.u));
            // if the file is an "AnimatedImage", then the link for the highest quality is in the "gif"/"mp4" field
            var file = post.media_metadata[fileIDs.media_id];
            var item = [];

            if (file.status == "valid") {
                if (file.e == "Image"){
                    item.push(decodeHTML(file.s.u))
                } else if (file.e == "AnimatedImage") {
                    item.push(decodeHTML(file.s.gif))
                } else {
                    console.error(`post ${post.name} image ${JSON.stringify(fileIDs.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
                }

                // if there was a known image type
                if (item.length>0) {
                    // add caption too if it exists
                    if(fileIDs.caption) {
                        item.push(fileIDs.caption)
                    }

                    gallery_collection.push(item);
                }

            } /*else if (file.status == "failed") {

            }*/
        });
        return gallery_collection;
    } else {
@@ -121,7 +190,6 @@ function getSubmissionImages(post) {
    }
}

// Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
// where comment and reply are ids in the comment table

@@ -207,6 +275,7 @@ function writeSql(post, targetCommName, targetUserName) {
    // post_aggregates is better to change anyway since its an INT8 and not an INT2, which means the score doesnt cap out at 2¹⁵-1, perhaps needing multiple entries for super popular posts
    // we use an UPDATE statement since there is already a row that gets made when we inserted the post prior
    // note: hovering over the upvote count will show the sad truth :(

    query += `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`

    // Traverse again but INSERT this time (this could probably be a function)
@@ -218,7 +287,6 @@ function writeSql(post, targetCommName, targetUserName) {
    const current = stack[index]; // Get the current item to process
    index++; // Move the index forward

    console.log("bye")
    // Insert the current comment
    query +=
        `INSERT INTO comment (creator_id, post_id, content, published, updated) ` +
@@ -247,4 +315,4 @@ function writeSql(post, targetCommName, targetUserName) {
    return `DO $$ ${query} $$;\n\n`
}

export { writeSql }
export { writeSql, mkUrl }
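The lit() change is about dollar-quoting: the whole generated query is wrapped in `DO $$ ... $$`, so a literal `$` sequence inside a post body could terminate that block early. A quick illustration of what the escaping produces, using he.decode directly in place of the file's decodeHTML wrapper and a made-up input string:

```js
import he from 'he';

// decode HTML entities, double single quotes for SQL, then escape $ so "$$$" can't close the DO $$ block
const lit = str => typeof str === 'string'
    ? he.decode(str).replace(/'/g, "''").replace(/\$/g, "\\$")
    : 'null';

// -> it''s \$5... &   (entity decoded, quote doubled, dollar escaped)
console.log(lit("it's $5... &amp;"));
```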
yarn.lock (12 changes)
@@ -75,6 +75,18 @@ require-directory@^2.1.1:
  resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42"
  integrity sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==

stream-chain@^2.2.5:
  version "2.2.5"
  resolved "https://registry.yarnpkg.com/stream-chain/-/stream-chain-2.2.5.tgz#b30967e8f14ee033c5b9a19bbe8a2cba90ba0d09"
  integrity sha512-1TJmBx6aSWqZ4tx7aTpBDXK0/e2hhcNSTV8+CbFJtDjbb+I1mZ8lHit0Grw9GRT+6JbIrrDd8esncgBi8aBXGA==

stream-json@^1.9.1:
  version "1.9.1"
  resolved "https://registry.yarnpkg.com/stream-json/-/stream-json-1.9.1.tgz#e3fec03e984a503718946c170db7d74556c2a187"
  integrity sha512-uWkjJ+2Nt/LO9Z/JyKZbMusL8Dkh97uUBTv3AJQ74y07lVahLY4eEFsPsE97pxYBwr8nnjMAIch5eqI0gPShyw==
  dependencies:
    stream-chain "^2.2.5"

string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
  version "4.2.3"
  resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"