remove unused json fields when parsing to reduce memory/file size of the final json

parent 7b6b69141c
commit 5a2e268f3c
@@ -22,9 +22,9 @@ You can build the SQL script before making the comm/user though.
 - - since right now it just changes the upvotes to be negative or whatever the score is
 
-- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space.
+- Remove the json fields from comments and posts that don't get used for importing, so that the final array of trees of posts/nested comments takes up less memory/space. ✔
 
-- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing
+- Save the final json to a file and read it through StreamArray, so that the memory is freed once the archive has finished processing ✔
 
 ## references
 
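For the second checked item, the pattern is to dump the assembled array to disk, drop the in-memory reference, and re-read the file element by element. A minimal sketch using the `stream-json` package, with a hypothetical `threads.json` path rather than the importer's real output name:

```js
// Sketch: read a file containing one big JSON array through StreamArray,
// so only one array element is materialised at a time.
const { createReadStream } = require('fs');
const StreamArray = require('stream-json/streamers/StreamArray');

const pipeline = createReadStream('threads.json').pipe(StreamArray.withParser());

// StreamArray emits one { key, value } pair per array element,
// which is why the importer's handler later reads `thread.value`.
pipeline.on('data', ({ key, value }) => {
    console.log(`thread ${key}:`, value.title);
});

pipeline.on('end', () => console.log('done'));
```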
src/index.js (15 changed lines)
@@ -32,7 +32,7 @@ var args = yargs(process.argv.slice(2))
     })
     .string(['comm', 'user', 'output', 'posts', 'comments'])
     .nargs(['comm', 'user', 'output', 'posts', 'comments'], 1)
-    .demandOption(['comm', 'user', 'output', 'posts', 'comments'])
+    .demandOption(['comm', 'user', 'posts', 'comments'])
     .help('h')
     .alias('h', 'help')
     .epilog("git: https://git.stardust.wtf/starlight/reddit-lemmy-importer")
@@ -44,8 +44,6 @@ processPostsAndComments(args.posts, args.comments, (result) => {
     console.log(result)
 }); */
 
-console.log(args.output?.trim())
-
 function printThreadStructure(thread, level = 0) {
     thread.forEach(item => {
         var out = '';
@@ -65,29 +63,34 @@ async function unflatten(postsFile, commentsFile) {
     //console.log('Thread Structure:');
     //printThreadStructure(result);
 
-    const resultOutput = `${result[0].subreddit}-threads.json`;
+    const subredditName = result[0].subreddit_name_prefixed.slice(2);
+    const resultOutput = `${subredditName}-threads.json`;
 
     // Write the result to a file
     writeFileSync(resultOutput, JSON.stringify(result, null, 2));
 
     result = {};
 
+    // read the threads through a stream
     const pipeline = createReadStream(resultOutput).pipe(streamArray.withParser());
 
     // empty the sql file if it exists
     existsSync(args.output) ? writeFileSync(args.output, '') : null
-    const sqlOutput = createWriteStream(args.output, {flags: "a"});
+    // write the sql to the user specified path, or use [subredditname].sql if none is chosen
+    const sqlOutput = createWriteStream(args.output?.trim() ? args.output : `${subredditName}.sql`, {flags: "a"});
 
     var threadCounter = 0;
 
+    // create an sql query to make a lemmy thread for each json object
     pipeline.on('data', (thread) => {
         sqlOutput.write(writeSql(thread.value, args.comm, args.user));
         threadCounter++;
     });
 
+    // close the stream and say how many threads are processed and where its saved
     pipeline.on('end', () => {
         sqlOutput.close();
-        console.log(`Finished processing ${threadCounter} threads, sql saved to ${resultOutput}`);
+        console.log(`Finished processing ${threadCounter} threads, sql saved to ${sqlOutput.path}.`);
     });
 
     // old
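The `.slice(2)` above works because `subreddit_name_prefixed` carries the `r/` prefix in Reddit dumps; a quick illustration with a made-up value:

```js
// "r/example" is an invented sample value for subreddit_name_prefixed.
const subredditName = "r/example".slice(2);            // "example"
const resultOutput = `${subredditName}-threads.json`;  // "example-threads.json"
```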
@@ -41,8 +41,26 @@ class ProcessingContext {
     }
 }
 
+// remove all fields from jsonObj that arent in the allowedKeys array
+function filterJsonKeys(jsonObj, allowedKeys) {
+    // Input validation
+    if (typeof jsonObj !== 'object' || jsonObj === null) {
+        throw new TypeError('Input must be an object');
+    }
+
+    return Object.fromEntries(
+        Object.entries(jsonObj)
+            .filter(([key]) => allowedKeys.includes(key))
+    );
+}
+
 async function processPostsAndComments(postsFile, commentsFile) {
     const context = new ProcessingContext();
 
+    // allowed fields
+    // reduced the size of my dummy json to about 31% of its size without the filters
+    const submissionKeysAllowed = ["author", "author_flair_text", "created_utc", "edited", "gallery_data", "is_gallery", "locked", "media_metadata", "name", "parent_id", "score", "selftext", "stickied", "subreddit_name_prefixed", "title", "url"];
+    const commentKeysAllowed = ["author", "author_flair_text", "body", "created_utc", "link_id", "name", "parent_id", "score", "subreddit_name_prefixed"];
+
     // Process posts first
     const postsStream = createInterface({
@@ -52,7 +70,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     for await (const line of postsStream) {
         if (line.trim()) {
-            const post = JSON.parse(line);
+            const post = filterJsonKeys(JSON.parse(line), submissionKeysAllowed);
             context.processItem(post);
         }
     }
@@ -65,7 +83,7 @@ async function processPostsAndComments(postsFile, commentsFile) {
 
     for await (const line of commentsStream) {
         if (line.trim()) {
-            const comment = JSON.parse(line);
+            const comment = filterJsonKeys(JSON.parse(line), commentKeysAllowed);
             context.processItem(comment);
         }
     }
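A rough usage illustration of the new `filterJsonKeys` helper, assuming the `filterJsonKeys` and `commentKeysAllowed` definitions from the hunk above; the object below is invented, not taken from a real dump:

```js
const raw = {
    author: "someone",
    body: "hello world",
    created_utc: 1609459200,
    link_id: "t3_abc123",
    name: "t1_def456",
    parent_id: "t3_abc123",
    score: 5,
    subreddit_name_prefixed: "r/example",
    // examples of fields the filter drops:
    all_awardings: [],
    gildings: {},
    send_replies: true
};

const comment = filterJsonKeys(raw, commentKeysAllowed);
// `comment` now holds only the whitelisted keys that were present in `raw`,
// which is what shrinks the tree that later gets written to the threads file.
```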