Added ability to resume from cache
AlexJSully committed Aug 26, 2024
1 parent 4db21a0 commit f6710f3
Showing 4 changed files with 73 additions and 16 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -30,17 +30,19 @@ Then run
npm install
```

followed by
### Running locally

To start the publication figure retrieval tool, run the following command:

```bash
npm start
npm run start
```

This tool runs within your Node.js environment. On Windows, this script may need to run in administrator mode.
If you choose to cancel this process at any time, you can resume where you left off by running the same command. Already processed PMC IDs are stored in `build/output/cache/id.json`; to reset the cache, delete the `id.json` file.
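For example, on macOS or Linux the cache can be reset with:

```bash
rm build/output/cache/id.json
```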

### Usage

The images are downloaded locally within the `build/processor/output` directory.
The images are downloaded locally within the `build/output` directory. They are organized by species then by publication ID.
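For illustration, a run that has processed a couple of publications might leave an output tree along these lines (the species folder, PMC IDs, and figure file names are hypothetical):

```text
build/output/
├── cache/
│   └── id.json
└── Arabidopsis_thaliana/
    ├── PMC123456/
    │   └── fig1.jpg
    └── PMC654321/
        └── fig1.jpg
```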

### API Key

23 changes: 23 additions & 0 deletions src/processor/fetchArticleDetails.test.ts
@@ -1,4 +1,6 @@
import axios from "axios";
import fs from "fs";
import path from "path";
import { fetchArticleDetails } from "./fetchArticleDetails";
import { parseFigures } from "./parseFigures";

@@ -9,9 +11,13 @@ describe("fetchArticleDetails", () => {
const throttle = jest.fn((fn) => fn());
const pmids = ["PMC123456", "PMC654321"];
const species = "Homo sapiens";
const cachedIDsFilePath = path.resolve(__dirname, "../../output/data/id.json");

beforeEach(() => {
jest.clearAllMocks();
if (fs.existsSync(cachedIDsFilePath)) {
fs.unlinkSync(cachedIDsFilePath);
}
});

it("should fetch article details in batches and call parseFigures", async () => {
@@ -38,4 +44,21 @@ describe("fetchArticleDetails", () => {

consoleErrorSpy.mockRestore();
});

it("should cache fetched IDs and skip already cached IDs", async () => {
const mockResponse = { data: "<xml>mock data</xml>" };
(axios.get as jest.Mock).mockResolvedValue(mockResponse);

// Initial fetch to cache the IDs
await fetchArticleDetails(throttle, pmids, species);

expect(fs.existsSync(cachedIDsFilePath)).toBe(true);
const cachedIDs = JSON.parse(fs.readFileSync(cachedIDsFilePath, "utf-8"));
expect(cachedIDs).toEqual(pmids);

// Fetch again with the same IDs, should skip fetching
await fetchArticleDetails(throttle, pmids, species);

expect(axios.get).toHaveBeenCalledTimes(1); // Should not call axios.get again
});
});
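To run just this suite locally, something along these lines should work, assuming the repository's Jest configuration handles TypeScript (for example via ts-jest):

```bash
npx jest src/processor/fetchArticleDetails.test.ts
```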
54 changes: 43 additions & 11 deletions src/processor/fetchArticleDetails.ts
@@ -1,4 +1,6 @@
import axios from "axios";
import fs from "fs";
import path from "path";
import { parseFigures } from "./parseFigures";

/**
@@ -19,34 +21,64 @@ import { parseFigures } from "./parseFigures";
export async function fetchArticleDetails(
/** The throttling function to control the rate of API requests. */
throttle: any,
/** An array of PMCIDs to fetch details for. */
/** An array of PMC IDs to fetch details for. */
pmids: string[],
/** The species name to be used in the processing of figures. */
species: string,
): Promise<void> {
/** Number of PMCIDs per batch. */
/** Number of PMC IDs per batch. */
const batchSize = 50;

// Grab cached IDs
/** Path to the cached IDs file. */
const cachedIDsFilePath = path.resolve(__dirname, "../output/cache/id.json");
/** Cached IDs list. */
let cachedIDs: string[] = [];
// Check if the cached IDs file exists
if (fs.existsSync(cachedIDsFilePath)) {
const data = fs.readFileSync(cachedIDsFilePath, "utf-8");
cachedIDs = JSON.parse(data);
} else {
// Create the directory if it doesn't exist
fs.mkdirSync(path.dirname(cachedIDsFilePath), { recursive: true });
}

// Get article details based on PMC IDs
for (let i = 0; i < pmids.length; i += batchSize) {
// Extract a batch of 50 PMCIDs
/** A batch of 50 PMCIDs. */
// Extract a batch of 50 PMC IDs
const batch = pmids.slice(i, i + batchSize);
/** Comma-separated list of PMCIDs. */
const ids = batch.join(",");
/** The URL to fetch article details for the current batch. */

// Filter out IDs that are already cached
const newBatch = batch.filter((id) => !cachedIDs.includes(id));

if (newBatch.length === 0) {
console.log(
`All IDs in ${species.replace("_", " ")} batch ${i + 1}-${i + batch.length} are already cached.`,
);

continue;
}

/** Comma-separated string of PMC IDs for the batch. */
const ids = newBatch.join(",");
/** URL for fetching article details from the NCBI API. */
let url = `https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=${ids}&retmode=xml`;
// Check if there is a NCBI API key available and if so, add it to the URL
// Add the API key if available
if (process?.env?.NCBI_API_KEY) {
url += `&api_key=${process.env.NCBI_API_KEY}`;
}

console.log(`Fetching article details for batch ${i + 1}-${i + batch.length}...`);
console.log(
`Fetching ${species.replace("_", " ")} article details for batch ${i + 1}-${i + newBatch.length}...`,
);

try {
// Make HTTP request to fetch article details in XML format for the current batch
/** The response from the API request. */
const response = await throttle(async () => await axios.get(url));
await parseFigures(throttle, response.data, species);

// Add the new IDs to the cached list and write to the file
cachedIDs.push(...newBatch);
fs.writeFileSync(cachedIDsFilePath, JSON.stringify(cachedIDs, null, 2));
} catch (error) {
console.error("Error fetching article details:", error);
}
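As a rough usage sketch (the pass-through throttle, PMC IDs, and species string are illustrative stand-ins), the resumable fetch can be driven like this:

```typescript
import { fetchArticleDetails } from "./fetchArticleDetails";

// Pass-through throttle for illustration; a real one would rate-limit requests to the NCBI API.
const throttle = async (fn: () => Promise<unknown>) => await fn();

(async () => {
	// IDs already listed in the cache file are skipped, so re-running resumes where it left off.
	await fetchArticleDetails(throttle, ["PMC123456", "PMC654321"], "Arabidopsis_thaliana");
})();
```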
2 changes: 1 addition & 1 deletion src/processor/parseFigures.ts
@@ -56,7 +56,7 @@ export async function parseFigures(
// Download all figures for this article
for (const url of figureUrls) {
// Create the directory path for species and PMC ID
const outputDir = path.join(__dirname, "output", species, pmcId);
const outputDir = path.join(__dirname, "../output", species, pmcId);
if (!fs.existsSync(outputDir)) {
fs.mkdirSync(outputDir, { recursive: true });
}
