Skip to content

Commit

Permalink
Merge pull request #240 from SimonSiefke/feature/image
Browse files Browse the repository at this point in the history
feature: add onlyGetOpenGraphInfo image option
  • Loading branch information
jshemas authored Aug 9, 2024
2 parents 573b7ee + a221120 commit 80b1800
Show file tree
Hide file tree
Showing 6 changed files with 44 additions and 15 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ Check the return for a ```success``` flag. If success is set to true, then the u
| fetchOptions | Options that are used by the Fetch API | {} | |
| timeout | Request timeout for Fetch (Default is 10 seconds) | 10 | |
| blacklist | Pass in an array of sites you don't want ogs to run on. | [] | |
| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. | false | |
| onlyGetOpenGraphInfo | Only fetch open graph info and don't fall back on anything else. Also accepts an array of property names (e.g. `['image']`) for which no fallback should be used. | false | |
| customMetaTags | Here you can define custom meta tags you want to scrape. | [] | |
| urlValidatorSettings | Sets the options used by validator.js for testing the URL | [Here](https://github.com/jshemas/openGraphScraper/blob/master/lib/utils.ts#L4-L17) | |

Expand Down
3 changes: 2 additions & 1 deletion index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ import type {
* for scraping Open Graph and Twitter Card info off a website.
*
* @param {object} options - The options used by Open Graph Scraper
* @param {boolean} [options.onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else.
* @param {boolean|string[]} [options.onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on
* anything else. Also accepts an array of property names for which no fallback should be used.
* @param {object} [options.customMetaTags] - Here you can define custom meta tags you want to scrape.
* @param {object} [options.fetchOptions] - Sets the options used by fetch for the http requests
* @param {object} [options.urlValidatorSettings] - Sets the options used by validator.js for testing the URL
Expand Down
2 changes: 1 addition & 1 deletion lib/extract.ts
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ export default function extractMetaTags(body: string, options: OpenGraphScraperO
ogObject = mediaSetup(ogObject);

// run the open graph fallbacks unless onlyGetOpenGraphInfo is true
// (when it is an array, fallback() skips only the listed properties)
if (!options.onlyGetOpenGraphInfo) {
if (!options.onlyGetOpenGraphInfo || Array.isArray(options.onlyGetOpenGraphInfo)) {
ogObject = fallback(ogObject, options, $, body);

$('script').each((index, script) => {
Expand Down
32 changes: 22 additions & 10 deletions lib/fallback.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import {
isImageTypeValid,
isUrlValid,
} from './utils';
import type { OpenGraphScraperOptions, ImageObject, OgObjectInteral } from './types';
import type {
OpenGraphScraperOptions, ImageObject, OgObjectInteral, OnlyGetOpenGraphInfoItem,
} from './types';

const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => (
$(selector).attr(attribute) && ($(selector).attr(attribute)?.length ?? 0) > 0
Expand All @@ -23,8 +25,18 @@ const doesElementExist = (selector:string, attribute:string, $: CheerioAPI) => (
*
*/
export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOptions, $: CheerioAPI, body: string) {
// Decides whether the fallback for `key` may run, based on options.onlyGetOpenGraphInfo:
//   - unset or false: every fallback runs
//   - true: no fallback runs
//   - array: fallbacks run for every property except the ones listed
const shouldFallback = (key: OnlyGetOpenGraphInfoItem): boolean => {
  const onlyOpenGraph = options.onlyGetOpenGraphInfo;
  // a truthy value other than `true` is the list of properties to keep open-graph-only
  if (onlyOpenGraph && onlyOpenGraph !== true) {
    return !onlyOpenGraph.includes(key);
  }
  return !onlyOpenGraph;
};

// title fallback
if (!ogObject.ogTitle) {
if (!ogObject.ogTitle && shouldFallback('title')) {
if ($('title').text() && $('title').text().length > 0) {
ogObject.ogTitle = $('title').first().text();
} else if ($('head > meta[name="title"]').attr('content') && ($('head > meta[name="title"]').attr('content')?.length ?? 0) > 0) {
Expand All @@ -41,7 +53,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// Get meta description tag if og description was not provided
if (!ogObject.ogDescription) {
if (!ogObject.ogDescription && shouldFallback('description')) {
if (doesElementExist('head > meta[name="description"]', 'content', $)) {
ogObject.ogDescription = $('head > meta[name="description"]').attr('content');
} else if (doesElementExist('head > meta[itemprop="description"]', 'content', $)) {
Expand All @@ -52,7 +64,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// Get all of images if there is no og:image info
if (!ogObject.ogImage) {
if (!ogObject.ogImage && shouldFallback('image')) {
ogObject.ogImage = [];
$('img').map((index, imageElement) => {
const source: string = $(imageElement).attr('src') ?? '';
Expand Down Expand Up @@ -85,7 +97,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// audio fallback
if (!ogObject.ogAudioURL && !ogObject.ogAudioSecureURL) {
if (!ogObject.ogAudioURL && !ogObject.ogAudioSecureURL && shouldFallback('audioUrl')) {
const audioElementValue: string = $('audio').attr('src') ?? '';
const audioSourceElementValue: string = $('audio > source').attr('src') ?? '';
if (doesElementExist('audio', 'src', $)) {
Expand All @@ -108,7 +120,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// locale fallback
if (!ogObject.ogLocale) {
if (!ogObject.ogLocale && shouldFallback('locale')) {
if (doesElementExist('html', 'lang', $)) {
ogObject.ogLocale = $('html').attr('lang');
} else if (doesElementExist('head > meta[itemprop="inLanguage"]', 'content', $)) {
Expand All @@ -117,7 +129,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// logo fallback
if (!ogObject.ogLogo) {
if (!ogObject.ogLogo && shouldFallback('logo')) {
if (doesElementExist('meta[itemprop="logo"]', 'content', $)) {
ogObject.ogLogo = $('meta[itemprop="logo"]').attr('content');
} else if (doesElementExist('img[itemprop="logo"]', 'src', $)) {
Expand All @@ -126,7 +138,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// url fallback
if (!ogObject.ogUrl) {
if (!ogObject.ogUrl && shouldFallback('url')) {
if (doesElementExist('link[rel="canonical"]', 'href', $)) {
ogObject.ogUrl = $('link[rel="canonical"]').attr('href');
} else if (doesElementExist('link[rel="alternate"][hreflang="x-default"]', 'href', $)) {
Expand All @@ -135,7 +147,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// date fallback
if (!ogObject.ogDate) {
if (!ogObject.ogDate && shouldFallback('date')) {
if (doesElementExist('head > meta[name="date"]', 'content', $)) {
ogObject.ogDate = $('head > meta[name="date"]').attr('content');
} else if (doesElementExist('[itemprop*="datemodified" i]', 'content', $)) {
Expand All @@ -152,7 +164,7 @@ export function fallback(ogObject: OgObjectInteral, options: OpenGraphScraperOpt
}

// favicon fallback
if (!ogObject.favicon) {
if (!ogObject.favicon && shouldFallback('favicon')) {
if (doesElementExist('link[rel="shortcut icon"]', 'href', $)) {
ogObject.favicon = $('link[rel="shortcut icon"]').attr('href');
} else if (doesElementExist('link[rel="icon"]', 'href', $)) {
Expand Down
6 changes: 4 additions & 2 deletions lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,15 @@ export interface ErrorResult {
result: OgObject;
}

export type OnlyGetOpenGraphInfoItem = 'image' | 'title' | 'description' | 'locale' | 'logo' | 'url' | 'favicon' | 'audioUrl' | 'date';

/**
* The options used by Open Graph Scraper
*
* @typeParam {string} url - URL of the site. (Required)
* @typeParam {string} [html] - You can pass in an HTML string to run ogs on it. (use without options.url)
* @typeParam {string[]} [blacklist] - Pass in an array of sites you don't want ogs to run on.
* @typeParam {boolean} [onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else.
* @typeParam {boolean | OnlyGetOpenGraphInfoItem[]} [onlyGetOpenGraphInfo] - Only fetch open graph info and don't fall back on anything else. Pass an array to disable the fallback only for the listed properties.
* @typeParam {CustomMetaTags} [customMetaTags] - Here you can define custom meta tags you want to scrape.
* @typeParam {Request} [fetchOptions] - The options passed into fetch.
* @typeParam {number} [timeout] - Number of seconds before the fetch request ends. (default is 10 seconds)
Expand All @@ -32,7 +34,7 @@ export interface OpenGraphScraperOptions {
customMetaTags?: CustomMetaTags[];
fetchOptions?: RequestInit;
html?: string;
onlyGetOpenGraphInfo?: boolean;
onlyGetOpenGraphInfo?: boolean | OnlyGetOpenGraphInfoItem[];
timeout?: number;
url?: string;
urlValidatorSettings?: ValidatorSettings;
Expand Down
14 changes: 14 additions & 0 deletions tests/unit/openGraphScraper.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,20 @@ describe('return ogs', function () {
});
});

it('when it should not fallback to image elements', function () {
mockAgent.get('http://www.test.com')
.intercept({ path: '/' })
.reply(200, basicHTML);

return ogs({ url: 'www.test.com', onlyGetOpenGraphInfo: ['image'] })
.then(function (data) {
expect(data.result.success).to.be.eql(true);
expect(data.result.ogImage).to.be.eql(undefined);
expect(data.html).to.be.eql(basicHTML);
expect(data.response).to.be.a('response');
});
});

it('when meta description exist while og description does not', function () {
mockAgent.get('http://www.test.com')
.intercept({ path: '/' })
Expand Down

0 comments on commit 80b1800

Please sign in to comment.