pokerogue/scripts/scrape-trainer-names/main.js

296 lines
9.7 KiB
JavaScript

import { existsSync, writeFileSync } from "node:fs";
import { format, inspect } from "node:util";
import chalk from "chalk";
import inquirer from "inquirer";
import { JSDOM } from "jsdom";
import { toCamelCase, toPascalSnakeCase, toTitleCase } from "../helpers/strings.js";
import { checkGenderAndType } from "./check-gender.js";
import { fetchNames, INVALID_URL } from "./fetch-names.js";
import { showHelpText } from "./help-message.js";
/**
* @packageDocumentation
* This script will scrape Bulbapedia for the English names of a given trainer class,
* outputting them as JSON.
* Usage: `pnpm scrape-trainers`
*/
/**
* @import { parsedNames } from "./types.js"
*/
const version = "1.0.0";
const OUTFILE_ALIASES = /** @type {const} */ (["-o", "--outfile", "--outFile"]);
/**
* A large object mapping each "base" trainer name to a list of replacements.
* Used to allow for trainer classes with different `TrainerType`s than in mainline.
* @type {Record<string, string[]>}
*/
const trainerNamesMap = {
pokemonBreeder: ["breeder"],
worker: ["worker", "snowWorker"],
richBoy: ["richKid"],
gentleman: ["rich"],
};
async function main() {
console.log(chalk.hex("#FF7F50")(`🍳 Trainer Name Scraper v${version}`));
const args = process.argv.slice(2);
const out = getOutfile(args);
// Break out if no args remain
if (args.length === 0) {
console.error(
chalk.red.bold(
`✗ Error: No trainer classes provided!\nArgs: ${chalk.hex("#7310fdff")(process.argv.slice(2).join(", "))}`,
),
);
showHelpText();
process.exitCode = 1;
return;
}
const output = await scrapeTrainerNames(args);
await tryWriteFile(out, output);
}
/**
* Get the outfile location from the args array.
* @param {string[]} args - The command line arguments
* @returns {string | undefined} The outfile location, or `undefined` if none is provided
* @remarks
* This will mutate the `args` array by removing the outfile from the list of arguments.
*/
function getOutfile(args) {
let /** @type {string} */ outFile;
// Extract the outfile as either the form "-o=y" or "-o y".
const hasEquals = /^.*=(.+)$/g.exec(args[0]);
if (hasEquals) {
outFile = hasEquals[1];
args.splice(0, 1);
} else if (/** @type {readonly string[]} */ (OUTFILE_ALIASES).includes(args[0])) {
outFile = args[1];
args.splice(0, 2);
} else {
console.log(chalk.hex("#ffa500")("No outfile detected, logging to stdout..."));
return;
}
console.log(chalk.hex("#ffa500")(`Using outfile: ${chalk.blue(outFile)}`));
return outFile;
}
/**
* Scrape the requested trainer names and format the resultant output.
* @param {string[]} classes The names of the trainer classes to retrieve
* @returns {Promise<string>} A Promise that resolves with the finished text.
*/
async function scrapeTrainerNames(classes) {
classes = [...new Set(classes)];
/**
* A Set containing all trainer URLs that have been seen.
* @type {Set<string>}
*/
const seenClasses = new Set();
/**
* A large array of tuples matching each class to their corresponding list of trainer names. \
* Trainer classes with only 1 gender will only contain the single array for that gender.
* @type {[keyName: string, names: string[] | parsedNames][]}
*/
const namesTuples = await Promise.all(
classes.map(async trainerClass => {
try {
const [trainerName, names] = await doFetch(trainerClass, seenClasses);
const namesObj = names.female.length === 0 ? names.male : names;
return /** @type {const} */ ([trainerName, namesObj]);
} catch (e) {
if (!(e instanceof Error)) {
throw new Error(chalk.red.bold("Unrecognized error detected:", inspect(e)));
}
// If the error contains an HTTP status, attempt to parse the code to give a more friendly
// response than JSDOM's "Resource was not loaded"gi
const errCode = /Status: (\d*)/g.exec(e.message)?.[1];
if (!errCode) {
throw e;
}
/** @type {string} */
let reason;
switch (+errCode) {
case 404:
reason = "Page not found";
break;
case 403:
reason = "Access is forbidden";
break;
default:
reason = `Server produced error code of ${+errCode}`;
}
throw new Error(
chalk.red.bold(`Failed to parse URL for ${chalk.hex("#7fff00")(`\"${trainerClass}\"`)}!\nReason: ${reason}`),
);
}
}),
);
// Grab all keys inside the name replacement map and change them accordingly.
const mappedNames = namesTuples.filter(tuple => tuple[0] in trainerNamesMap);
for (const mappedName of mappedNames) {
const namesMapping = trainerNamesMap[mappedName[0]];
namesTuples.splice(
namesTuples.indexOf(mappedName),
1,
...namesMapping.map(
name => /** @type {[keyName: string, names: parsedNames | string[]]} */ ([name, mappedName[1]]),
),
);
}
namesTuples.sort((a, b) => a[0].localeCompare(b[0]));
/** @type {Record<string, string[] | parsedNames>} */
const namesRecord = Object.fromEntries(namesTuples);
// Convert all arrays into objects indexed by numbers
return JSON.stringify(
namesRecord,
(_, v) => {
if (Array.isArray(v)) {
return v.reduce((ret, curr, i) => {
ret[i + 1] = curr; // 1 indexed
return ret;
}, {});
}
return v;
},
2,
);
}
/**
* Recursively scrape names from a given Trainer class and its gender counterparts.
* @param {string} trainerClass - The URL to parse
* @param {Set<string>} seenClasses - A Set containing all seen class URLs, used for record keeping.
* @returns {Promise<[string, parsedNames]>}
* A Promise that resolves with:
* 1. The name to use for the key.
* 2. All fetched names for this trainer class and its gender variants.
*/
async function doFetch(trainerClass, seenClasses) {
let keyName = toCamelCase(trainerClass);
// Bulba URLs are in Pascal_Snake_Case (Pokemon_Breeder)
const classURL = toPascalSnakeCase(trainerClass);
seenClasses.add(classURL);
// Bulbapedia has redirects mapping basically all variant spellings of each trainer name to the corresponding main page.
// We thus rely on it
const { document } = (await JSDOM.fromURL(`https://bulbapedia.bulbagarden.net/wiki/${classURL}`)).window;
const trainerListHeader = document.querySelector("#Trainer_list")?.parentElement;
const [female, counterpartURLs] = checkGenderAndType(document);
const names = fetchNames(trainerListHeader, female);
if (names === INVALID_URL) {
return Promise.reject(
new Error(chalk.red.bold(`URL \"${classURL}\" did not correspond to a valid trainer class!`)),
);
}
// Recurse into all unseen gender counterparts' URLs, using the first male name we find
const counterpartNames = await Promise.all(
counterpartURLs
.filter(url => !seenClasses.has(url))
.map(counterpartURL => {
console.log(chalk.green(`Accessing gender counterpart URL: ${toTitleCase(counterpartURL)}`));
return doFetch(counterpartURL, seenClasses);
}),
);
let overrodeName = false;
for (const [cKeyName, cNameObj] of counterpartNames) {
if (!overrodeName && female) {
overrodeName = true;
console.log(chalk.green(`Using "${cKeyName}" as the name of the JSON key object...`));
keyName = cKeyName;
}
names.male = [...new Set(names.male.concat(cNameObj.male))];
names.female = [...new Set(names.female.concat(cNameObj.female))];
}
return [normalizeDiacritics(keyName), names];
}
/**
* Convert all diacritical marks within a string into their normalized variants.
* @param {string} str - The string to parse
* @returns {string} The string with normalized diacritics
*/
function normalizeDiacritics(str) {
// Normalizing to NFKD splits all diacritics into the base letter + grapheme (à -> a + `),
// which are conveniently all in their own little Unicode block for easy removal
return str.normalize("NFKD").replace(/[\u0300-\u036f]/g, "");
}
/**
* Try to write the output to a file (or log it to stdout, as the case may be).
* @param {string | undefined} outFile - The outfile
* @param {string} output - The scraped output to produce
*/
async function tryWriteFile(outFile, output) {
if (!outFile) {
console.log(output);
return;
}
if (existsSync(outFile) && !(await promptExisting(outFile))) {
process.exitCode = 1;
return;
}
try {
writeFileSync(outFile, output);
console.log(chalk.green.bold(`✔ Output written to ${chalk.blue(outFile)} successfully!`));
} catch (e) {
let /** @type {string} */ errStr;
if (!(e instanceof Error)) {
errStr = format("Unknown error occurred: ", e);
} else {
// @ts-expect-error - Node.JS file errors always have codes
switch (e.code) {
case "ENOENT":
errStr = `File not found: ${outFile}`;
break;
case "EACCES":
errStr = `Could not write ${outFile}: Permission denied`;
break;
case "EISDIR":
errStr = `Unable to write to ${outFile} as it is a directory`;
break;
default:
errStr = `Error writing file: ${e.message}`;
}
}
console.error(chalk.red.bold(errStr));
process.exitCode = 1;
return;
}
}
/**
* Confirm overwriting an already-existing file.
* @param {string} outFile - The outfile
* @returns {Promise<boolean>} Whether "Yes" or "No" was selected.
*/
async function promptExisting(outFile) {
return (
await inquirer.prompt([
{
type: "confirm",
name: "continue",
message: `File ${chalk.blue(outFile)} already exists!` + "\nDo you want to replace it?",
default: false,
},
])
).continue;
}
main();