Compare commits

2 Commits

7 changed files with 78 additions and 18 deletions

.gitignore vendored
View File

@@ -3,4 +3,4 @@ node_modules/
 *.sql
 ok
 src/test.js
-processed-threads.json
+*-threads.json

View File

@@ -22,9 +22,9 @@ You can build the SQL script before making the comm/user though.
 - - since right now it just changes the upvotes to be negative or whatever the score is
-- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
+- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
-- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
+- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
 ## references
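
The last two TODO items are what this compare implements. As a rough sketch of the StreamArray technique they describe, assuming a file that holds one big top-level JSON array; the file name threads.json is a placeholder, not the project's:

import { createReadStream } from 'node:fs';
import StreamArray from 'stream-json/streamers/StreamArray.js';

// Stream the array from disk: elements are parsed and emitted one at a
// time, so the whole array never has to sit in memory at once.
const pipeline = createReadStream('threads.json').pipe(StreamArray.withParser());

// stream-json emits one { key, value } chunk per array element, where
// key is the element's index and value is the parsed element itself.
pipeline.on('data', ({ key, value }) => {
    console.log(`element ${key}:`, value);
});
pipeline.on('end', () => console.log('done reading'));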

package.json
View File

@@ -10,6 +10,7 @@
     "chalk": "^5.3.0",
     "he": "^1.2.0",
     "moment": "^2.30.1",
+    "stream-json": "^1.9.1",
     "yargs": "^17.7.2"
   },
   "scripts": {

View File

@@ -2,20 +2,22 @@ import { processPostsAndComments } from './parser.js';
 import { writeSql } from './sql.js';
 import { join } from "node:path";
-// es6 >:(
+// es6 path >:(
 import path from 'path';
 import {
     fileURLToPath
 } from 'url';
-import { exists, existsSync, writeFileSync, appendFileSync } from 'node:fs';
 export const __filename = fileURLToPath(
     import.meta.url);
 export const __dirname = path.dirname(__filename);
+import { existsSync, writeFileSync, createReadStream, createWriteStream } from 'node:fs';
 import yargs from 'yargs';
+import streamArray from 'stream-json/streamers/StreamArray.js';
 // https://github.com/yargs/yargs/blob/main/docs/examples.md section "Yargs is here to help you..."
 var args = yargs(process.argv.slice(2))
     .alias('c', 'comm')
@@ -30,7 +32,7 @@ var args = yargs(process.argv.slice(2))
     })
     .string(['comm', 'user', 'output', 'posts', 'comments'])
     .nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
-    .demandOption(['comm', 'user', 'output', 'posts', 'comments'])
+    .demandOption(['comm', 'user', 'posts', 'comments'])
     .help('h')
     .alias('h', 'help')
     .epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -57,18 +59,44 @@ function printThreadStructure(thread, level = 0) {
 async function unflatten(postsFile, commentsFile) {
     try {
-        const result = await processPostsAndComments(postsFile, commentsFile);
+        var result = await processPostsAndComments(postsFile, commentsFile);
         //console.log('Thread Structure:');
         //printThreadStructure(result);
-        // Optional: write the result to a file
-        //writeFileSync('processed-threads.json', JSON.stringify(result, null, 2));
+        const subredditName = result[0].subreddit_name_prefixed.slice(2);
+        const resultOutput = `${subredditName}-threads.json`;
-        // empty the file if it exists
+        // Write the result to a file
+        writeFileSync(resultOutput, JSON.stringify(result, null, 2));
+        result = {};
+        // read the threads through a stream
+        const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
+        // empty the sql file if it exists
         existsSync(args.output) ? writeFileSync(args.output, '') : null
-        result.forEach(post => {
+        // write the sql to the user specified path, or use [subredditname].sql if none is chosen
+        const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
+        var threadCounter = 0;
+        // create an sql query to make a lemmy thread for each json object
+        pipeline.on('data', (thread) => {
+            sqlOutput.write(writeSql(thread.value, args.comm, args.user));
+            threadCounter++;
+        });
+        // close the stream and say how many threads are processed and where its saved
+        pipeline.on('end', () => {
+            sqlOutput.close();
+            console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
+        });
+        // old
+        /* result.forEach(post => {
             appendFileSync(args.output, writeSql(post, args.comm, args.user))
-        })
+        }) */
     } catch (error) {
         console.error('Error processing files:', error);
     }
@@ -77,5 +105,4 @@ async function unflatten(postsFile, commentsFile) {
 // Run the main function
 unflatten(args.posts, args.comments);
 //console.log("HOLY FUCKING SMOKES!" + existsSync(tree))
-const outputPath = join(__dirname, '/', args.output);
+//const outputPath = join(__dirname, '/', args.output);
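
With --output no longer demanded, a hypothetical invocation could look like the line below; the script name main.js and all argument values are placeholders, and omitting --output makes the SQL land in [subredditname].sql:

node main.js --comm mycommunity --user archive_bot --posts example_posts.ndjson --comments example_comments.ndjson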

parser.js
View File

@@ -41,8 +41,26 @@ class ProcessingContext {
     }
 }
+// remove all fields from jsonObj that arent in the allowedKeys array
+function filterJsonKeys(jsonObj, allowedKeys) {
+    // Input validation
+    if (typeof jsonObj !== 'object' || jsonObj === null) {
+        throw new TypeError('Input must be an object');
+    }
+    return Object.fromEntries(
+        Object.entries(jsonObj)
+            .filter(([key]) => allowedKeys.includes(key))
+    );
+}
 async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
+    // allowed fields
+    // reduced the size of my dummy json to about 31% of its size without the filters
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
+    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
     // Process posts first
     const postsStream = createInterface({
@@ -52,7 +70,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
     for await (const line of postsStream) {
         if (line.trim()) {
-            const post = JSON.parse(line);
+            const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
             context.processItem(post);
         }
     }
@@ -65,7 +83,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
     for await (const line of commentsStream) {
         if (line.trim()) {
-            const comment = JSON.parse(line);
+            const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
             context.processItem(comment);
         }
     }
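
For illustration, filterJsonKeys keeps only the allowed keys and drops everything else; a hypothetical call with invented values:

filterJsonKeys({ author: 'someone', body: 'hello', gilded: 0, banned_by: null }, ['author', 'body']);
// => { author: 'someone', body: 'hello' }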

sql.js
View File

@@ -1,5 +1,7 @@
 // shamelessly stolen code from https://github.com/mesmere/RedditLemmyImporter/blob/main/src/main/kotlin/write.kt
 // also reading the lemmy schema in lemmy/crates/db_schema/src/schema.rs
+// reads the created tree of a post and its comments and builds a json query to add it to your lemmy comm
 import moment from 'moment';
 import he from 'he';

yarn.lock
View File

@@ -75,6 +75,18 @@ require-directory@^2.1.1:
   resolved "https://registry.yarnpkg.com/require-directory/-/require-directory-2.1.1.tgz#8c64ad5fd30dab1c976e2344ffe7f792a6a6df42"
   integrity sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==
 
+stream-chain@^2.2.5:
+  version "2.2.5"
+  resolved "https://registry.yarnpkg.com/stream-chain/-/stream-chain-2.2.5.tgz#b30967e8f14ee033c5b9a19bbe8a2cba90ba0d09"
+  integrity sha512-1TJmBx6aSWqZ4tx7aTpBDXK0/e2hhcNSTV8+CbFJtDjbb+I1mZ8lHit0Grw9GRT+6JbIrrDd8esncgBi8aBXGA==
+
+stream-json@^1.9.1:
+  version "1.9.1"
+  resolved "https://registry.yarnpkg.com/stream-json/-/stream-json-1.9.1.tgz#e3fec03e984a503718946c170db7d74556c2a187"
+  integrity sha512-uWkjJ+2Nt/LO9Z/JyKZbMusL8Dkh97uUBTv3AJQ74y07lVahLY4eEFsPsE97pxYBwr8nnjMAIch5eqI0gPShyw==
+  dependencies:
+    stream-chain "^2.2.5"
+
 string-width@^4.1.0, string-width@^4.2.0, string-width@^4.2.3:
   version "4.2.3"
   resolved "https://registry.yarnpkg.com/string-width/-/string-width-4.2.3.tgz#269c7117d27b05ad2e536830a8ec895ef9c6d010"