From a64550953a88a8c069f38597f882312382eed9dd Mon Sep 17 00:00:00 2001 From: Alexander Sullivan Date: Fri, 9 Aug 2024 14:00:10 -0400 Subject: [PATCH] Moved `getPMCList` to own file --- src/index.js | 3 +- src/scripts/data-retrieval.js | 61 ------------------ src/scripts/get-pmc-list.js | 63 +++++++++++++++++++ ...retrieval.test.js => get-pmc-list.test.js} | 2 +- 4 files changed, 66 insertions(+), 63 deletions(-) create mode 100644 src/scripts/get-pmc-list.js rename src/scripts/{data-retrieval.test.js => get-pmc-list.test.js} (95%) diff --git a/src/index.js b/src/index.js index 279baf4..48ea706 100644 --- a/src/index.js +++ b/src/index.js @@ -5,7 +5,8 @@ import "dotenv/config.js"; import * as fs from "fs"; import lodash from "lodash"; import throttledQueue from "throttled-queue"; -import { getPMCList, retrieveFigures } from "./scripts/data-retrieval.js"; +import { retrieveFigures } from "./scripts/data-retrieval.js"; +import { getPMCList } from "./scripts/get-pmc-list.js"; /** Throttled queue for ENTREZ API requests (1 per second) */ const throttle = throttledQueue(1, 1000); diff --git a/src/scripts/data-retrieval.js b/src/scripts/data-retrieval.js index 672af2a..6fd9f7c 100644 --- a/src/scripts/data-retrieval.js +++ b/src/scripts/data-retrieval.js @@ -8,67 +8,6 @@ const throttle = throttledQueue(1, 1000); /** Throttled queue for image downloading (2 per second) */ const throttleImages = throttledQueue(2, 1000); -/** - * Retrieve PMCs from NCBI's/NIHs ENTREZ database. - * @param {String} species Species name(s) to search for (default "Arabidopsis thaliana") - * @param {Number} maxIDs Maximum number of IDs to return (default 10000000) - * @returns {Array} List of PMCs - */ -export async function getPMCList(species = "Arabidopsis thaliana", maxIDs = 10000000) { - // Clean arguments - species = species.trim().split(" ").join("_"); - maxIDs = Number.isNaN(maxIDs) ? 
10000000 : maxIDs; - - // Build API URL - /** ENTREZ's esearch */ - const base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"; - /** Database */ - const db = "db=pmc&"; - /** What is being searched for */ - const term = `term=${species}[Organism]&`; - /** Maximum number of publications */ - const retmax = `retmax=${maxIDs}`; - - /** API's URL */ - const url = base + db + term + retmax; - - // console feedback - console.log(`Retrieving ${species.split("_").join(" ")} PMCs...`); - - // Get and return XML document/data - /** API's response */ - const response = await axios.get(url, { - responseType: "document", - headers: { - "Content-Type": "text/xml", - }, - params: { - retmode: "xml", - }, - }); - /** API's data */ - const data = response?.data; - - // Go through XML and extract PMCs under eSEarchResult -> IdList -> Id - /** PMC list */ - let pmcList = []; - - if (data) { - // Parse the XML string with JSDOM - /** JSDOM document */ - const dom = new JSDOM(data, { contentType: "text/xml" }); - - // Use JSDOM to parse XML and get all PMCs - pmcList = Array.from(dom.window.document.querySelectorAll("Id")).map((id) => id.textContent); - - console.log(`Found ${pmcList.length} PMCs for ${species.split("_").join(" ")}...`); - return pmcList; - } - - console.error("No data found..."); - return pmcList; -} - /** All data that has already been retrieved */ let dataRetrieved; /** All publications that failed to be scraped or had some issue with them */ diff --git a/src/scripts/get-pmc-list.js b/src/scripts/get-pmc-list.js new file mode 100644 index 0000000..c76a5bc --- /dev/null +++ b/src/scripts/get-pmc-list.js @@ -0,0 +1,63 @@ +import axios from "axios"; +import { JSDOM } from "jsdom"; + +/** + * Retrieve PMCs from NCBI's/NIHs ENTREZ database. 
/**
 * Retrieve PMC IDs for a species from NCBI's/NIH's ENTREZ database (esearch on the `pmc` db).
 *
 * Invalid arguments fall back to their defaults instead of producing a malformed query:
 * a non-string or blank `species` becomes "Arabidopsis thaliana", and a `maxIDs` that is not a
 * finite, non-negative number (or numeric string) becomes 10000000.
 *
 * @param {String} species Species name(s) to search for (default "Arabidopsis thaliana")
 * @param {Number} maxIDs Maximum number of IDs to return (default 10000000)
 * @returns {Promise<Array<String>>} Text content of every `Id` element in the response (empty when the response carries no data)
 * @throws Rejects with whatever `axios.get` or the JSDOM XML parser throws; nothing is caught here
 */
export async function getPMCList(species = "Arabidopsis thaliana", maxIDs = 10000000) {
	/** Fallback species, mirrors the parameter default */
	const defaultSpecies = "Arabidopsis thaliana";
	/** Fallback ID limit, mirrors the parameter default */
	const defaultMaxIDs = 10000000;

	// Clean arguments (kept in new bindings so the parameters are never reassigned)
	/** Species as given, or the default when it is not a usable string */
	const rawSpecies = typeof species === "string" && species.trim() !== "" ? species : defaultSpecies;
	/** Species for the query: any run of whitespace becomes a single underscore */
	const searchName = rawSpecies.trim().split(/\s+/).join("_");
	/** Species for console feedback */
	const displayName = searchName.split("_").join(" ");

	/** Numeric strings (e.g. "25") are accepted; everything else is judged as-is */
	const numericMax = typeof maxIDs === "string" && maxIDs.trim() !== "" ? Number(maxIDs) : maxIDs;
	/** Validated limit; NaN, Infinity, negatives, null, etc. fall back to the default */
	const limit =
		typeof numericMax === "number" && Number.isFinite(numericMax) && numericMax >= 0
			? Math.floor(numericMax)
			: defaultMaxIDs;

	// Build API URL
	/** ENTREZ's esearch */
	const base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?";
	/** API's URL: database, what is being searched for, maximum number of publications */
	const url = `${base}db=pmc&term=${searchName}[Organism]&retmax=${limit}`;

	// console feedback
	console.log(`Retrieving ${displayName} PMCs...`);

	// Get XML document/data
	/** API's response */
	const response = await axios.get(url, {
		responseType: "document",
		headers: {
			"Content-Type": "text/xml",
		},
		params: {
			retmode: "xml",
		},
	});
	/** API's data */
	const data = response?.data;

	if (!data) {
		console.error("No data found...");
		return [];
	}

	// Go through XML and extract PMCs under eSearchResult -> IdList -> Id
	/** JSDOM document parsed as XML */
	const dom = new JSDOM(data, { contentType: "text/xml" });
	/** PMC list */
	const pmcList = Array.from(dom.window.document.querySelectorAll("Id"), (id) => id.textContent);

	console.log(`Found ${pmcList.length} PMCs for ${displayName}...`);
	return pmcList;
}
a/src/scripts/data-retrieval.test.js +++ b/src/scripts/get-pmc-list.test.js @@ -1,5 +1,5 @@ import axios from "axios"; -import { getPMCList } from "./data-retrieval"; +import { getPMCList } from "./get-pmc-list"; // Mock the axios module jest.mock("axios");