268 lines
13 KiB
JavaScript
268 lines
13 KiB
JavaScript
// shamelessly stolen code from https://github.com/mesmere/RedditLemmyImporter/blob/main/src/main/kotlin/write.kt
|
|
// also reading the lemmy schema in lemmy/crates/db_schema/src/schema.rs
|
|
// reads the created tree of a post and its comments and builds an SQL query to add it to your lemmy comm
|
|
|
|
import moment from 'moment';
|
|
import he from 'he';
|
|
|
|
/* // Utility function to escape values and wrap them in single quotes for SQL
|
|
function lit(str) {
|
|
if (typeof str === 'string') {
|
|
// Decode HTML entities for the content to display properly and escape special characters for SQL
|
|
let decodedStr = decodeHTML(str);
|
|
return `'${decodedStr.replace(/'/g, "''").replace(/\\/g, '\\\\')}'`; // Double escape single quotes and backslashes
|
|
} else {
|
|
return "NULL"; // Return 'NULL' if str is not a valid string
|
|
}
|
|
} */
|
|
|
|
/**
 * Escape a value so it can sit between single quotes in a SQL string literal.
 * Strings are HTML-entity-decoded first (via decodeHTML), then every ' is doubled to ''.
 * The surrounding quotes are NOT added here.
 *
 * @param {*} str - the value to escape
 * @returns {string} the escaped text, or the text 'null' when str is not a string
 */
function lit(str) {
  if (typeof str !== 'string') {
    return 'null';
  }
  const decoded = decodeHTML(str);
  // splitting on ' and re-joining with '' doubles every single quote
  return decoded.split("'").join("''");
}
|
|
|
|
/**
 * Decode HTML entities (e.g., '&gt;' -> '>') using the 'he' package.
 *
 * @param {string} html - text that may contain HTML entities
 * @returns {string} whatever he.decode produces for the input
 */
function decodeHTML(html) {
  const decoded = he.decode(html);
  return decoded;
}
|
|
|
|
/**
 * Convert a UTC Unix timestamp (seconds) to a Postgres timestamptz literal in UTC.
 * 1650751789 -> TIMESTAMPTZ '2022-04-23 22:09:49+00' and NOT '2022-04-24 10:09:49' or anything else
 * https://www.reddit.com/r/4tran/comments/uag3ji/screw_you_passoid/ example
 *
 * The +00 is for the db to know it is in UTC. The literal has to be typed TIMESTAMPTZ
 * for that to work: in a plain TIMESTAMP literal Postgres silently ignores the offset,
 * and the value is then reinterpreted in the session time zone when it is stored in a
 * timestamptz column.
 * Date#toISOString always renders UTC, so the output does not depend on the time zone
 * of the machine running this code.
 *
 * @param {number|string} unixTimestamp - seconds since 1970-01-01T00:00:00Z
 * @returns {string} a literal such as TIMESTAMPTZ '2022-04-23 22:09:49+00'
 * @throws {RangeError} when the value is not a finite number or is outside the range Date supports
 */
function toPostgresTimestamp(unixTimestamp) {
  const millis = Number(unixTimestamp) * 1000;
  if (!Number.isFinite(millis)) {
    throw new RangeError(`invalid unix timestamp: ${unixTimestamp}`);
  }
  // "2022-04-23T22:09:49.000Z" -> "2022-04-23 22:09:49"
  const utc = new Date(millis).toISOString().replace('T', ' ').replace(/\.\d{3}Z$/, '');
  return `TIMESTAMPTZ '${utc}+00'`;
}
|
|
|
|
/**
 * Get the timestamp literal for when the post/comment was edited, or 'null' if there is none.
 *
 * `edited` is falsy when the item was never edited and an epoch-seconds value when it was.
 * Reddit also reports the bare boolean `true` for some old edited content; that carries no
 * edit time, so it is treated as "no timestamp" instead of being converted to
 * 1970-01-01 00:00:01.
 *
 * @param post - Your json object for a submission or comment
 * @returns {string} the result of toPostgresTimestamp(post.edited), or the text 'null'
 */
function mkUpdated(post) {
  const edited = post.edited;
  if (!edited || edited === true) {
    return 'null';
  }
  return toPostgresTimestamp(edited);
}
|
|
|
|
/**
 * Build the post title, limited to 200 Unicode code points.
 * HTML entities are decoded first because decoding changes the length and the limit
 * applies to the final text. The title is split by code point (not UTF-16 code unit)
 * so the count matches what the String iterator yields.
 *
 * @param post - Your json object for a submission
 * @returns {string} the decoded title, or its first 197 code points followed by "..." when it is longer than 200
 */
function mkTitle(post) {
  const decodedTitle = decodeHTML(post.title);
  const codePoints = Array.from(decodedTitle);
  if (codePoints.length <= 200) {
    return decodedTitle;
  }
  // 197 code points + "..." = 200
  const head = codePoints.slice(0, 197).join("");
  return head + "...";
}
|
|
|
|
/**
 * Build the SQL value for the post's url column.
 *
 * The url is wrapped in single quotes HERE and not in the query like '${lit(mkUrl(post))}'.
 * This is because the null type in postgres would otherwise be turned into a string,
 * which will break lemmy until you remove the row or set it to null manually.
 *
 * Single quotes inside the url are doubled so the url cannot terminate the SQL literal.
 * A missing or empty url yields null rather than the text 'undefined'.
 *
 * @param post - Your json object for a submission
 * @returns {string} a quoted SQL string literal, or the text null
 *   - gallery posts: the first image from getSubmissionImages (falling back to post.url)
 *   - self posts: null
 *   - everything else: post.url
 */
function mkUrl(post) {
  let url;
  if (post.is_gallery) {
    const firstImage = getSubmissionImages(post)[0];
    url = typeof firstImage === 'string' ? firstImage : post.url;
  } else if (!post.is_self) {
    url = post.url;
  }

  if (typeof url !== 'string' || url === '') {
    return 'null';
  }
  return `'${url.replace(/'/g, "''")}'`;
}
|
|
|
|
/**
 * Build the body text for an imported post or comment.
 *
 * The body starts with an inline-code header (wrapped in backticks so it stands apart from
 * the actual content) holding the author, the author's flair if any, and the original
 * subreddit. Every later section is prefixed with a blank line ("\n\n") rather than
 * suffixed, so there are no trailing newlines when nothing follows.
 *
 * @returns {string} a post body with user/subreddit info, and images based on the post type
 * @param post - Your json object for a submission or comment
 * @param postType - "post" or "comment", the types are handled differently
 */
function mkBody(post, postType) {
  // add "/u/" assuming the user isnt deleted
  const author = post.author === "[deleted]" ? post.author : `/u/${post.author}`;

  // add flair if it exists
  const flair = post.author_flair_text && post.author_flair_text.trim() ? ` - ${post.author_flair_text}` : '';

  // post.subreddit is used instead of post.subreddit_name_prefixed since the latter isnt in all posts
  let outputBody = `\`${author}${flair} - originally from /r/${post.subreddit}\``;

  // comments use a different field for the body (.body and not .selftext)
  if (postType === "post") {
    // add the full title if mkTitle will truncate it.
    // the length is measured on the DECODED title, the same way mkTitle measures it,
    // so a title that only exceeds 200 because of HTML entities does not get a needless note.
    if ([...decodeHTML(post.title)].length > 200) {
      outputBody += `\n\n\`Full title: ${post.title}\``;
    }

    // add "Gallery links:" then all the image links as bullet points if the post is a gallery
    const postImages = getSubmissionImages(post);
    if (postImages.length > 1) {
      const bullets = postImages.map(image => `\n\n- ${image}`).join('');
      outputBody += `\n\nGallery links:\n- ${post.url}${bullets}`;
    }

    // only if selftext exists, it wont exist if its an image post
    if (post.selftext && post.selftext.trim()) {
      outputBody += `\n\n${post.selftext}`;
    }
  } else if (postType === "comment") {
    if (post.body && post.body.trim()) {
      outputBody += `\n\n${post.body}`;
    }
  } else {
    console.error("YOU FUCKED UP THE MKBODY CALL");
  }

  return outputBody;
}
|
|
|
|
/**
 * Collect the image urls of a submission.
 *
 * For a gallery (is_gallery with gallery_data and media_metadata present) the ids are taken
 * from gallery_data.items, because its items are in the order the gallery is displayed in,
 * while media_metadata sorts the images alphanumerically. Each id is then looked up in
 * media_metadata, where:
 *   s   = data for the best quality version of the image
 *   s.u = the url of it
 *   for an "AnimatedImage" the best quality link is in the "gif"/"mp4" field instead
 * Items that are not "valid", have no metadata entry, or have no usable link are skipped.
 *
 * Deleted gallery posts have their gallery_data and media_metadata erased; those, and every
 * non-gallery post, return the post url (which is the link to the image when not a gallery).
 *
 * @returns {String[]} - the array which contains the urls of the best quality image versions, preserving the original order the post had
 * @param post - Your json object for a submission
 */
function getSubmissionImages(post) {
  const galleryItems = post.gallery_data != null ? post.gallery_data.items : null;
  if (!post.is_gallery || !Array.isArray(galleryItems) || post.media_metadata == null) {
    return [post.url];
  }

  const galleryCollection = [];
  for (const item of galleryItems) {
    const file = post.media_metadata[item.media_id];

    // a gallery item can reference an id that has no metadata entry; skip it like a failed upload
    if (file == null || file.status !== "valid") {
      continue;
    }

    let link;
    if (file.e === "Image") {
      link = file.s && file.s.u;
    } else if (file.e === "AnimatedImage") {
      link = file.s && (file.s.gif || file.s.mp4);
    } else {
      console.error(`post ${post.name} image ${JSON.stringify(item.media_id)} in media_metadata has an unknown filetype: ${file.e}`);
      continue;
    }

    // only decode when a link was actually found
    if (typeof link === 'string') {
      galleryCollection.push(decodeHTML(link));
    }
  }
  return galleryCollection;
}
|
|
|
|
|
|
// Ltrees ("path" column in the "comment" table) in lemmy: 0.comment.reply.[...].reply
|
|
// where comment and reply are ids in the comment table
|
|
|
|
/*
|
|
make functions for:
|
|
✔making a string literal to handle sql injection
|
|
✔handling a timestamp to work as a Timestamptz in UTC (e.g 2025-01-16 09:23:43.308171+00)
|
|
✔create the post body with selftext and additional info (like user info and flair)
|
|
✔if the post title is too long then add the full title to the body too (see line below)
|
|
✔also handle images in posts
|
|
✔if getSubmissionImages(post).length > 1:
|
|
✔forEach loop through the image links and add them to the post 'body'
|
|
✔handle the title by truncating max length of 200 Unicode code POINTS instead of UTF-16 code UNITS which the String: length property counts (important)
|
|
✔do the same with comment body
|
|
✔output the url to the first image in a gallery if the post is a gallery, if not, then output the url field if !post.is_self
|
|
|
|
|
|
✔for the sql create a function that takes a post object and:
|
|
✔DECLARE 3 variables as INTEGER:
|
|
✔comm_id, user_id, post_id
|
|
✔then DECLARE variables named {[post/comment].name}_id as INTEGER
|
|
✔do this by looping through every comment in the tree and do the post of course
|
|
✔BEGIN
|
|
✔SET comm_id and user_id based off their IDs in the database (community and person table) using the names given via user input here
|
|
✔insert a row into the "post" table using the data while RETURNING the id of the post INTO STRICT as post_id;
|
|
|
|
✔do the same in "post_aggregate" for the score minus the returning bit
|
|
|
|
✔loop through every comment in the tree
|
|
✔import the comment as you did for the post, dont specify a path and just let the db set a default for now
|
|
✔if its a top-level comment (parent_id and link_id will be the same), then leave it as is, since the defaulted path will be correct
|
|
✔if it has a parent, then set the path to [the path of its parent] + [.] + [its own id]
|
|
✔also, the tree must be looped through from the top-down, since you need to allocate ids for the highest comments first, since you wont be able to set the path right if a child is done first
|
|
✔END
|
|
|
|
execute the query like this https://www.postgresql.org/docs/current/sql-do.html
|
|
*/
|
|
|
|
/**
 * List every comment under a post in breadth-first order.
 * Parents always come before their replies, which the INSERTs rely on: a reply's path is
 * derived from its parent's row, so the parent's id must already be allocated.
 *
 * @param post - a submission whose comments are nested under "children"
 * @returns {Object[]} all comments at every depth; post.children itself is not modified
 */
function flattenComments(post) {
  const queue = [...(post.children || [])];
  // the queue grows while it is being walked, so use an index instead of for...of semantics on a copy
  for (let i = 0; i < queue.length; i++) {
    if (queue[i].children) {
      queue.push(...queue[i].children);
    }
  }
  return queue;
}

/**
 * Pick a dollar-quote delimiter that does not occur anywhere in the given text.
 * Uses the plain $$ whenever possible and only switches to a tagged delimiter
 * ($lemmy_import_0$, $lemmy_import_1$, ...) when the text itself contains the candidate.
 *
 * @param {string} sql - the text that will be placed between the delimiters
 * @returns {string} a delimiter that is safe to wrap the text in
 */
function pickDollarQuote(sql) {
  let quote = '$$';
  for (let n = 0; sql.includes(quote); n++) {
    quote = `$lemmy_import_${n}$`;
  }
  return quote;
}

/**
 * @returns {string} an sql query for INSERTing the provided post and its entire comment tree into your lemmy database
 * @param post - Your json object for a submission, it should have all of its comments in "children" (and ITS comments in "children" etc)
 * @param targetCommName - name of the community you want the archive to be added to
 * @param targetUserName - name of the user you want to create the archive under
 */
function writeSql(post, targetCommName, targetUserName) {
  const comments = flattenComments(post);
  // the names come from user input, so double any ' before placing them in a literal
  const escapeName = name => String(name).replace(/'/g, "''");

  let query = "DECLARE comm_id INTEGER;\nDECLARE user_id INTEGER;\nDECLARE root_post_id INTEGER;\n";
  query += `DECLARE ${post.name}_id CONSTANT INTEGER := NULL;\n`;

  // DECLARE an integer for every comment; it will hold the id the db assigns to that comment
  for (const comment of comments) {
    query += `DECLARE ${comment.name}_id INTEGER;\n`;
  }

  query += "BEGIN\n";

  // Get the IDs for the target comm and user
  query += `SELECT id INTO STRICT comm_id FROM community WHERE name='${escapeName(targetCommName)}';\n`;
  query += `SELECT id INTO STRICT user_id FROM person WHERE name='${escapeName(targetUserName)}';\n`;

  // Insert the post.
  query +=
    `INSERT INTO post (name, url, body, creator_id, community_id, locked, published, featured_community) ` +
    `VALUES ('${lit(mkTitle(post))}', ${mkUrl(post)}, '${lit(mkBody(post, "post"))}', user_id, comm_id, ${post.locked}, ${toPostgresTimestamp(post.created_utc)}, ${post.stickied}) ` +
    `RETURNING id INTO STRICT root_post_id;\n`;

  // Likes
  // We use post_aggregates and not post_like to change the score, unlike the kotlin RedditLemmyImporter
  // This is because the like entries are now invalidated unless they are a score of -1 or 1
  // post_aggregates is better to change anyway since its an INT8 and not an INT2, which means the score doesnt cap out at 2^15-1
  // we use an UPDATE statement since there is already a row that gets made when we inserted the post prior
  query += `UPDATE post_aggregates SET score=${post.score} WHERE post_id=root_post_id;\n`;

  // INSERT every comment, parents first
  for (const comment of comments) {
    query +=
      `INSERT INTO comment (creator_id, post_id, content, published, updated) ` +
      `VALUES (user_id, root_post_id, '${lit(mkBody(comment, "comment"))}', ${toPostgresTimestamp(comment.created_utc)}, ${mkUpdated(comment)}) ` +
      `RETURNING id INTO STRICT ${comment.name}_id;\n`;

    // if the parent and root post aren't the same (i.e its not a top level comment)
    if (comment.link_id != comment.parent_id) {
      // update the path to be its parent path + its own id
      // this works since postgres fills in the gaps, if you update it to its parents path, it will actually end up as [parents path + .ownID]
      query += `UPDATE comment SET path = (SELECT path FROM comment WHERE id=${comment.parent_id}_id) WHERE id=${comment.name}_id;\n`;
    }

    query += `UPDATE comment_aggregates SET upvotes=${comment.score} WHERE comment_id=${comment.name}_id;\n`;
  }

  query += "END";

  // post/comment text may itself contain "$$", which would end a $$-quoted DO body early,
  // so wrap the body in a delimiter that is guaranteed not to appear inside it
  const quote = pickDollarQuote(query);
  return `DO ${quote} ${query} ${quote};\n\n`;
}
|
|
|
|
export { writeSql } |