From 3d1883020ae5bde401f65a93987a90f86a3336aa Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Tue, 30 Jul 2024 14:07:36 +0200 Subject: [PATCH 1/8] feature: add onlyGetOpenGraphInfo image option --- lib/fallback.ts | 2 +- lib/types.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/fallback.ts b/lib/fallback.ts index 554e3cb..591e838 100644 --- a/lib/fallback.ts +++ b/lib/fallback.ts @@ -52,7 +52,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // Get all of images if there is no og:image info - if (!ogObject.ogImage) { + if (!ogObject.ogImage && options.onlyGetOpenGraphInfo !== 'image') { ogObject.ogImage = []; $('img').map((index, imageElement) => { const source: string = $(imageElement).attr('src') ?? ''; diff --git a/lib/types.ts b/lib/types.ts index b7182ea..590919a 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -32,7 +32,7 @@ export interface OpenGraphScraperOptions { customMetaTags?: CustomMetaTags[]; fetchOptions?: RequestInit; html?: string; - onlyGetOpenGraphInfo?: boolean; + onlyGetOpenGraphInfo?: boolean | 'image'; timeout?: number; url?: string; urlValidatorSettings?: ValidatorSettings; From c715e16282835459f99505803f7fbe5818f29af6 Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Wed, 31 Jul 2024 13:49:11 +0200 Subject: [PATCH 2/8] fix image fallback --- lib/extract.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/extract.ts b/lib/extract.ts index 30067f8..4c09b3c 100644 --- a/lib/extract.ts +++ b/lib/extract.ts @@ -89,7 +89,7 @@ export default function extractMetaTags(body: string, options: OpenGraphScraperO ogObject = mediaSetup(ogObject); // if onlyGetOpenGraphInfo isn't set, run the open graph fallbacks - if (!options.onlyGetOpenGraphInfo) { + if (!options.onlyGetOpenGraphInfo || options.onlyGetOpenGraphInfo === 'image') { ogObject = fallback(ogObject, options, $, body); $('script').each((index, script) => { From d4d5043fed6ac8e22c9f79b8cbcc365de8f63265 Mon 
Sep 17 00:00:00 2001 From: Simon Siefke Date: Thu, 1 Aug 2024 00:36:02 +0200 Subject: [PATCH 3/8] update options --- index.ts | 2 +- lib/extract.ts | 2 +- lib/fallback.ts | 31 +++++++++++++++++++++---------- lib/types.ts | 6 ++++-- 4 files changed, 27 insertions(+), 14 deletions(-) diff --git a/index.ts b/index.ts index 89214d8..f66f1b5 100644 --- a/index.ts +++ b/index.ts @@ -11,7 +11,7 @@ import type { * for scraping Open Graph and Twitter Card info off a website. * * @param {object} options - The options used by Open Graph Scraper - * @param {boolean} [options.onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else. + * @param {boolean|string[]} [options.onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else. * @param {object} [options.customMetaTags] - Here you can define custom meta tags you want to scrape. * @param {object} [options.fetchOptions] - Sets the options used by fetch for the http requests * @param {object} [options.urlValidatorSettings] - Sets the options used by validator.js for testing the URL diff --git a/lib/extract.ts b/lib/extract.ts index 4c09b3c..7e3f644 100644 --- a/lib/extract.ts +++ b/lib/extract.ts @@ -89,7 +89,7 @@ export default function extractMetaTags(body: string, options: OpenGraphScraperO ogObject = mediaSetup(ogObject); // if onlyGetOpenGraphInfo isn't set, run the open graph fallbacks - if (!options.onlyGetOpenGraphInfo || options.onlyGetOpenGraphInfo === 'image') { + if (!options.onlyGetOpenGraphInfo || Array.isArray(options.onlyGetOpenGraphInfo)) { ogObject = fallback(ogObject, options, $, body); $('script').each((index, script) => { diff --git a/lib/fallback.ts b/lib/fallback.ts index 591e838..acb459e 100644 --- a/lib/fallback.ts +++ b/lib/fallback.ts @@ -7,7 +7,7 @@ import { isImageTypeValid, isUrlValid, } from './utils'; -import type { OpenGraphScraperOptions, ImageObject, OgObjectInteral } from './types'; +import type { OpenGraphScraperOptions, ImageObject, 
OgObjectInteral, OnlyGetOpenGraphInfoItem } from './types'; const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => ( $(selector).attr(attribute) && ($(selector).attr(attribute)?.length ?? 0) > 0 @@ -23,8 +23,19 @@ const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => ( * */ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOptions, $: CheerioAPI, body: string) { + + const shouldFallback = (key: OnlyGetOpenGraphInfoItem): boolean => { + if(!options.onlyGetOpenGraphInfo){ + return true; + } + if(options.onlyGetOpenGraphInfo === true){ + return false; + } + return !options.onlyGetOpenGraphInfo.includes(key); + }; + // title fallback - if (!ogObject.ogTitle) { + if (!ogObject.ogTitle && shouldFallback('title')) { if ($('title').text() && $('title').text().length > 0) { ogObject.ogTitle = $('title').first().text(); } else if ($('head > meta[name="title"]').attr('content') && ($('head > meta[name="title"]').attr('content')?.length ?? 0) > 0) { @@ -41,7 +52,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // Get meta description tag if og description was not provided - if (!ogObject.ogDescription) { + if (!ogObject.ogDescription && shouldFallback('description')) { if (doesElementExist('head > meta[name="description"]', 'content', $)) { ogObject.ogDescription = $('head > meta[name="description"]').attr('content'); } else if (doesElementExist('head > meta[itemprop="description"]', 'content', $)) { @@ -52,7 +63,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // Get all of images if there is no og:image info - if (!ogObject.ogImage && options.onlyGetOpenGraphInfo !== 'image') { + if (!ogObject.ogImage && shouldFallback('image')) { ogObject.ogImage = []; $('img').map((index, imageElement) => { const source: string = $(imageElement).attr('src') ?? 
''; @@ -85,7 +96,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // audio fallback - if (!ogObject.ogAudioURL && !ogObject.ogAudioSecureURL) { + if (!ogObject.ogAudioURL && !ogObject.ogAudioSecureURL && shouldFallback('audioUrl')) { const audioElementValue: string = $('audio').attr('src') ?? ''; const audioSourceElementValue: string = $('audio > source').attr('src') ?? ''; if (doesElementExist('audio', 'src', $)) { @@ -108,7 +119,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // locale fallback - if (!ogObject.ogLocale) { + if (!ogObject.ogLocale && shouldFallback('locale')) { if (doesElementExist('html', 'lang', $)) { ogObject.ogLocale = $('html').attr('lang'); } else if (doesElementExist('head > meta[itemprop="inLanguage"]', 'content', $)) { @@ -117,7 +128,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // logo fallback - if (!ogObject.ogLogo) { + if (!ogObject.ogLogo && shouldFallback('logo')) { if (doesElementExist('meta[itemprop="logo"]', 'content', $)) { ogObject.ogLogo = $('meta[itemprop="logo"]').attr('content'); } else if (doesElementExist('img[itemprop="logo"]', 'src', $)) { @@ -126,7 +137,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // url fallback - if (!ogObject.ogUrl) { + if (!ogObject.ogUrl && shouldFallback('url')) { if (doesElementExist('link[rel="canonical"]', 'href', $)) { ogObject.ogUrl = $('link[rel="canonical"]').attr('href'); } else if (doesElementExist('link[rel="alternate"][hreflang="x-default"]', 'href', $)) { @@ -135,7 +146,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // date fallback - if (!ogObject.ogDate) { + if (!ogObject.ogDate && shouldFallback('date')) { if (doesElementExist('head > meta[name="date"]', 'content', $)) { ogObject.ogDate = $('head > meta[name="date"]').attr('content'); } else if 
(doesElementExist('[itemprop*="datemodified" i]', 'content', $)) { @@ -152,7 +163,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt } // favicon fallback - if (!ogObject.favicon) { + if (!ogObject.favicon && shouldFallback('favicon')) { if (doesElementExist('link[rel="shortcut icon"]', 'href', $)) { ogObject.favicon = $('link[rel="shortcut icon"]').attr('href'); } else if (doesElementExist('link[rel="icon"]', 'href', $)) { diff --git a/lib/types.ts b/lib/types.ts index 590919a..2c8def6 100644 --- a/lib/types.ts +++ b/lib/types.ts @@ -15,13 +15,15 @@ export interface ErrorResult { result: OgObject; } +export type OnlyGetOpenGraphInfoItem = 'image' | 'title' | 'description' | 'locale' | 'logo' | 'url' | 'favicon' | 'audioUrl' | 'date'; + /** * The options used by Open Graph Scraper * * @typeParam {string} url - URL of the site. (Required) * @typeParam {string} [html] - You can pass in an HTML string to run ogs on it. (use without options.url) * @typeParam {string[]} [blacklist] - Pass in an array of sites you don't want ogs to run on. - * @typeParam {boolean} [onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else. + * @typeParam {boolean | OnlyGetOpenGraphInfoItem[]} [onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else. * @typeParam {CustomMetaTags} [customMetaTags] - Here you can define custom meta tags you want to scrape. * @typeParam {Request} [fetchOptions] - The options passed into fetch. * @typeParam {number} [timeout] - Number of seconds before the fetch request ends. 
(default is 10 seconds) @@ -32,7 +34,7 @@ export interface OpenGraphScraperOptions { customMetaTags?: CustomMetaTags[]; fetchOptions?: RequestInit; html?: string; - onlyGetOpenGraphInfo?: boolean | 'image'; + onlyGetOpenGraphInfo?: boolean | OnlyGetOpenGraphInfoItem[]; timeout?: number; url?: string; urlValidatorSettings?: ValidatorSettings; From dee9f97c50e0725d5db6754912c72fc8877529c4 Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Thu, 1 Aug 2024 00:40:51 +0200 Subject: [PATCH 4/8] add test --- tests/unit/openGraphScraper.spec.ts | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/unit/openGraphScraper.spec.ts b/tests/unit/openGraphScraper.spec.ts index 8cf9513..a8ef1e7 100644 --- a/tests/unit/openGraphScraper.spec.ts +++ b/tests/unit/openGraphScraper.spec.ts @@ -149,6 +149,22 @@ describe('return ogs', function () { }); }); + it('when it should not fallback to image elements', function () { + mockAgent.get('http://www.test.com') + .intercept({ path: '/' }) + .reply(200, multipleImageHTML); + + return ogs({ url: 'www.test.com', onlyGetOpenGraphInfo: ['image'] }) + .then(function (data) { + expect(data.result.success).to.be.eql(true); + expect(data.result.ogTitle).to.be.eql('test page'); + expect(data.result.ogImage).to.be.eql([]); + expect(data.result.requestUrl).to.be.eql('http://www.test.com'); + expect(data.html).to.be.eql(multipleImageHTML); + expect(data.response).to.be.a('response'); + }); + }); + it('when meta description exist while og description does not', function () { mockAgent.get('http://www.test.com') .intercept({ path: '/' }) From a2478ed2ad439811d6b574ef0aca91ea21437089 Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Thu, 1 Aug 2024 00:41:39 +0200 Subject: [PATCH 5/8] fix test --- tests/unit/openGraphScraper.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/openGraphScraper.spec.ts b/tests/unit/openGraphScraper.spec.ts index a8ef1e7..81d5408 100644 --- 
a/tests/unit/openGraphScraper.spec.ts +++ b/tests/unit/openGraphScraper.spec.ts @@ -152,7 +152,7 @@ describe('return ogs', function () { it('when it should not fallback to image elements', function () { mockAgent.get('http://www.test.com') .intercept({ path: '/' }) - .reply(200, multipleImageHTML); + .reply(200, basicHTML); return ogs({ url: 'www.test.com', onlyGetOpenGraphInfo: ['image'] }) .then(function (data) { From 8f23ccea7d92bd61a63852ee9f4f34a5a8b64bd9 Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Thu, 1 Aug 2024 00:42:34 +0200 Subject: [PATCH 6/8] fix test --- tests/unit/openGraphScraper.spec.ts | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/unit/openGraphScraper.spec.ts b/tests/unit/openGraphScraper.spec.ts index 81d5408..98101ed 100644 --- a/tests/unit/openGraphScraper.spec.ts +++ b/tests/unit/openGraphScraper.spec.ts @@ -157,10 +157,8 @@ describe('return ogs', function () { return ogs({ url: 'www.test.com', onlyGetOpenGraphInfo: ['image'] }) .then(function (data) { expect(data.result.success).to.be.eql(true); - expect(data.result.ogTitle).to.be.eql('test page'); - expect(data.result.ogImage).to.be.eql([]); - expect(data.result.requestUrl).to.be.eql('http://www.test.com'); - expect(data.html).to.be.eql(multipleImageHTML); + expect(data.result.ogImage).to.be.eql(undefined); + expect(data.html).to.be.eql(basicHTML); expect(data.response).to.be.a('response'); }); }); From f8eecab0f0701772ac42b4e95ff2e1229f003e0a Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Thu, 1 Aug 2024 07:20:16 +0200 Subject: [PATCH 7/8] fix eslint --- index.ts | 3 ++- lib/fallback.ts | 11 ++++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/index.ts b/index.ts index f66f1b5..d615ed8 100644 --- a/index.ts +++ b/index.ts @@ -11,7 +11,8 @@ import type { * for scraping Open Graph and Twitter Card info off a website. 
* * @param {object} options - The options used by Open Graph Scraper - * @param {boolean|string[]} [options.onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else. + * @param {boolean|string[]} [options.onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on + * anything else. An array disables the fallback only for the listed properties. * @param {object} [options.customMetaTags] - Here you can define custom meta tags you want to scrape. * @param {object} [options.fetchOptions] - Sets the options used by fetch for the http requests * @param {object} [options.urlValidatorSettings] - Sets the options used by validator.js for testing the URL diff --git a/lib/fallback.ts b/lib/fallback.ts index acb459e..7163978 100644 --- a/lib/fallback.ts +++ b/lib/fallback.ts @@ -7,7 +7,9 @@ import { isImageTypeValid, isUrlValid, } from './utils'; -import type { OpenGraphScraperOptions, ImageObject, OgObjectInteral, OnlyGetOpenGraphInfoItem } from './types'; +import type { + OpenGraphScraperOptions, ImageObject, OgObjectInteral, OnlyGetOpenGraphInfoItem, +} from './types'; const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => ( $(selector).attr(attribute) && ($(selector).attr(attribute)?.length ?? 
0) > 0 @@ -23,12 +25,11 @@ const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => ( * */ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOptions, $: CheerioAPI, body: string) { - const shouldFallback = (key: OnlyGetOpenGraphInfoItem): boolean => { - if(!options.onlyGetOpenGraphInfo){ - return true; + if (!options.onlyGetOpenGraphInfo) { + return true; } - if(options.onlyGetOpenGraphInfo === true){ + if (options.onlyGetOpenGraphInfo === true) { return false; } return !options.onlyGetOpenGraphInfo.includes(key); From a221120f84a1ec00a7377483567ad17b3988bae9 Mon Sep 17 00:00:00 2001 From: Simon Siefke Date: Thu, 1 Aug 2024 07:20:20 +0200 Subject: [PATCH 8/8] update readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 16dea42..e22089c 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ Check the return for a ```success``` flag. If success is set to true, then the u | fetchOptions | Options that are used by the Fetch API | {} | | | timeout | Request timeout for Fetch (Default is 10 seconds) | 10 | | | blacklist | Pass in an array of sites you don't want ogs to run on. | [] | | -| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. | false | | +| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. Also accepts an array of property names (e.g. `['image', 'title']`) for which no fallback should be used; all other properties still fall back. | false | | | customMetaTags | Here you can define custom meta tags you want to scrape. | [] | | | urlValidatorSettings | Sets the options used by validator.js for testing the URL | [Here](https://github.com/jshemas/openGraphScraper/blob/master/lib/utils.ts#L4-L17) | |