Skip to content

Commit

Permalink
Moved getPMCList to own file
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexJSully committed Aug 9, 2024
1 parent b65846c commit a645509
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 63 deletions.
3 changes: 2 additions & 1 deletion src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ import "dotenv/config.js";
import * as fs from "fs";
import lodash from "lodash";
import throttledQueue from "throttled-queue";
import { getPMCList, retrieveFigures } from "./scripts/data-retrieval.js";
import { retrieveFigures } from "./scripts/data-retrieval.js";
import { getPMCList } from "./scripts/get-pmc-list.js";

/** Throttled queue for ENTREZ API requests: at most 1 queued call per 1000 ms window (1 per second) */
const throttle = throttledQueue(1, 1000);
Expand Down
61 changes: 0 additions & 61 deletions src/scripts/data-retrieval.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,67 +8,6 @@ const throttle = throttledQueue(1, 1000);
/** Throttled queue for image downloading: at most 2 queued calls per 1000 ms window (2 per second) */
const throttleImages = throttledQueue(2, 1000);

/**
 * Look up PMC IDs for a species through ENTREZ's esearch endpoint (NCBI/NIH).
 *
 * Spaces in the species name are swapped for underscores before it is placed in the
 * `term=<species>[Organism]` query; a `NaN` limit is replaced by the default.
 * A failed request is not caught here, so the returned promise rejects with axios' error.
 *
 * @param {String} species Species name(s) to search for (default "Arabidopsis thaliana")
 * @param {Number} maxIDs Maximum number of IDs to return (default 10000000)
 * @returns {Array} Text content of every `Id` element in the response, or an empty array when the response has no data
 */
export async function getPMCList(species = "Arabidopsis thaliana", maxIDs = 10000000) {
	/** Species name in the underscore-separated form used in the query */
	const organism = species.trim().replace(/ /g, "_");
	/** Requested maximum number of IDs (only a literal NaN is replaced) */
	const limit = Number.isNaN(maxIDs) ? 10000000 : maxIDs;
	/** Species name with underscores turned back into spaces, for console output */
	const label = organism.replace(/_/g, " ");

	/** Full esearch URL: PMC database, organism term, result cap */
	const url = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pmc&term=${organism}[Organism]&retmax=${limit}`;

	/** Request configuration handed to axios */
	const requestConfig = {
		responseType: "document",
		headers: {
			"Content-Type": "text/xml",
		},
		params: {
			retmode: "xml",
		},
	};

	console.log(`Retrieving ${label} PMCs...`);

	const response = await axios.get(url, requestConfig);
	/** XML payload of the response, if any */
	const xml = response?.data;

	// Nothing to parse: report it and hand back an empty list
	if (!xml) {
		console.error("No data found...");
		return [];
	}

	// Parse the XML with JSDOM and collect the IDs found under eSearchResult -> IdList -> Id
	const { document } = new JSDOM(xml, { contentType: "text/xml" }).window;
	const ids = Array.from(document.querySelectorAll("Id"), (node) => node.textContent);

	console.log(`Found ${ids.length} PMCs for ${label}...`);
	return ids;
}

/** All data that has already been retrieved */
let dataRetrieved;
/** All publications that failed to be scraped or had some issue with them */
Expand Down
63 changes: 63 additions & 0 deletions src/scripts/get-pmc-list.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import axios from "axios";
import { JSDOM } from "jsdom";

/** Species searched when the caller supplies none, or supplies an unusable value */
const DEFAULT_SPECIES = "Arabidopsis thaliana";
/** Result cap used when the caller supplies none, or supplies an unusable value */
const DEFAULT_MAX_IDS = 10000000;

/**
 * Turn a caller-supplied ID limit into a non-negative integer.
 * @param {*} value Raw `maxIDs` argument (number or numeric string)
 * @returns {Number} Floored limit, or `DEFAULT_MAX_IDS` when the value is not a finite number >= 0
 */
function normalizeMaxIDs(value) {
	// Accept numeric strings such as "50"; blank strings are treated as "not provided"
	const parsed = typeof value === "string" && value.trim() !== "" ? Number(value) : value;

	if (typeof parsed !== "number" || !Number.isFinite(parsed) || parsed < 0) {
		return DEFAULT_MAX_IDS;
	}

	return Math.floor(parsed);
}

/**
 * Retrieve PMCs from NCBI's/NIH's ENTREZ database.
 *
 * The species name is trimmed, runs of whitespace are collapsed to a single underscore and the
 * result is URL-encoded before being placed in the `term=<species>[Organism]` query. A missing,
 * blank or non-string species falls back to the default, as does a limit that is not a finite
 * number >= 0. For ordinary inputs the request URL and options are the same as before.
 * A failed request is not caught here, so the returned promise rejects with axios' error.
 *
 * @param {String} species Species name(s) to search for (default "Arabidopsis thaliana")
 * @param {Number} maxIDs Maximum number of IDs to return (default 10000000)
 * @returns {Promise<Array<String>>} Text content of every `Id` element in the response, or an empty array when the response has no data
 */
export async function getPMCList(species = DEFAULT_SPECIES, maxIDs = DEFAULT_MAX_IDS) {
	// Clean arguments without reassigning the parameters
	const trimmed = typeof species === "string" ? species.trim() : "";
	/** Species name in the underscore-separated form used in the query */
	const organism = (trimmed || DEFAULT_SPECIES).split(/\s+/).join("_");
	/** Validated maximum number of publications */
	const limit = normalizeMaxIDs(maxIDs);
	/** Species name with underscores turned back into spaces, for console output */
	const label = organism.split("_").join(" ");

	// Build API URL
	/** ENTREZ's esearch */
	const base = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?";
	// Encode the species so characters such as "&" or "#" cannot break the query string;
	// letters, digits and underscores are left untouched by encodeURIComponent
	/** API's URL */
	const url = `${base}db=pmc&term=${encodeURIComponent(organism)}[Organism]&retmax=${limit}`;

	// console feedback
	console.log(`Retrieving ${label} PMCs...`);

	// Get XML document/data
	/** API's response */
	const response = await axios.get(url, {
		responseType: "document",
		headers: {
			"Content-Type": "text/xml",
		},
		params: {
			retmode: "xml",
		},
	});
	/** API's data */
	const data = response?.data;

	if (!data) {
		console.error("No data found...");
		return [];
	}

	// Parse the XML string with JSDOM and extract PMCs under eSearchResult -> IdList -> Id
	/** JSDOM document */
	const dom = new JSDOM(data, { contentType: "text/xml" });
	/** PMC list */
	const pmcList = Array.from(dom.window.document.querySelectorAll("Id"), (id) => id.textContent);

	console.log(`Found ${pmcList.length} PMCs for ${label}...`);
	return pmcList;
}
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import axios from "axios";
import { getPMCList } from "./data-retrieval";
import { getPMCList } from "./get-pmc-list";

// Mock the axios module so the axios calls made by getPMCList can be stubbed in these tests
jest.mock("axios");
Expand Down

0 comments on commit a645509

Please sign in to comment.