-
Notifications
You must be signed in to change notification settings - Fork 0
/
ScrapperJob.js
112 lines (97 loc) · 2.79 KB
/
ScrapperJob.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import { ElementHandle } from "puppeteer";
/**
 * Walks a paginated listing with Puppeteer and saves every linked article
 * as a PDF file.
 */
export default class ScrapperJob {
  /** Puppeteer page used for all navigation and rendering. */
  #page;

  /** Caller-supplied controller; stored as-is and not used by this class. */
  controller;

  /** Prefix prepended verbatim to every PDF file name (no separator is added). */
  savePath;

  /**
   * @param {import("puppeteer").Page} page - page already showing the first listing
   * @param {string} savePath - prefix for the generated PDF paths
   * @param {*} controller - stored on the instance, otherwise untouched
   */
  constructor(page, savePath, controller) {
    this.#page = page;
    this.savePath = savePath;
    this.controller = controller;
  }

  /**
   * Saves every article of the current listing page, then follows the
   * "next page" link until there is none left.
   *
   * @returns {Promise<void>}
   */
  async run() {
    // Guards against a pager whose "next" link keeps pointing at a page
    // that was already visited, which would otherwise loop forever.
    const visitedUrls = new Set();

    while (true) {
      const articles = await this.getArticlesInPage();
      await this.processArticlesInPage(articles);

      const nextPageUrl = await this.getNextPageButton(this.#page);
      if (nextPageUrl == null || visitedUrls.has(nextPageUrl)) {
        return;
      }
      visitedUrls.add(nextPageUrl);
      await this.#page.goto(nextPageUrl, { waitUntil: 'networkidle0' });
    }
  }

  /**
   * Collects the anchors inside `div.bookList` on the current page.
   *
   * @returns {Promise<ElementHandle[]>} empty when the list container is absent
   */
  async getArticlesInPage() {
    const listContainer = await this.#page.$('div.bookList');
    // `page.$` resolves to null when nothing matches the selector.
    if (!listContainer) {
      return [];
    }
    return listContainer.$$('a');
  }

  /**
   * Reads the `.caption` text of an article anchor, with line breaks
   * turned into spaces so it can be used as a single-line name.
   *
   * @param {ElementHandle} article
   * @returns {Promise<string|undefined>} undefined when no article is given
   */
  async getArticleName(article) {
    if (!article) return undefined;
    return article.$eval(
      '.caption',
      // Replace every line break, not only the first one.
      (node) => node.innerText.replaceAll('\n', ' '),
    );
  }

  /**
   * Opens an article by clicking its anchor.
   *
   * @param {ElementHandle} article
   * @returns {Promise<void>}
   */
  async openArticle(article) {
    await article.click();
  }

  /**
   * Renders the page currently shown as `<savePath><fileName>.pdf`.
   * Page size comes from the VIEWWIDTH / VIEWHEIGHT environment variables,
   * falling back to 1920x1080 when they are unset or not numeric.
   *
   * @param {string} fileName - file name without extension
   * @returns {Promise<void>}
   */
  async savePdf(fileName) {
    const path = `${this.savePath}${fileName}.pdf`;
    await this.#page.emulateMediaType('screen');
    console.log(`save pdf to ${path}`);
    await this.#page.pdf({
      path,
      // `||` (not `??`) on purpose: parseInt yields NaN for a missing variable.
      width: Number.parseInt(process.env.VIEWWIDTH, 10) || 1920,
      height: Number.parseInt(process.env.VIEWHEIGHT, 10) || 1080,
    });
  }

  /**
   * Lists name and link of every `a.bookItem` anchor on the given page.
   *
   * @param {import("puppeteer").Page} page
   * @returns {Promise<Array<{name: string, article: string}>>}
   */
  async getArticlesInpage(page) {
    const bookList = await page.$$('a.bookItem');
    console.log(`size: ${bookList.length}`);

    const list = [];
    for (const item of bookList) {
      // The link has to be read inside the page context: an ElementHandle
      // has no `href` property of its own on the Node side.
      const article = await item.evaluate((anchor) => ({
        name: anchor.querySelector('.caption').innerText.replaceAll('\n', ' '),
        article: anchor.href,
      }));
      list.push(article);
    }
    return list;
  }

  /**
   * Looks up the "next page" link of the pager.
   *
   * @param {import("puppeteer").Page} page
   * @returns {Promise<string|null>} URL of the next page, or null when the
   *   link is missing or has no target
   */
  async getNextPageButton(page) {
    // `page.$eval` would throw on the last page where the link is absent,
    // so look the element up first and treat "not found" as "no next page".
    const nextLink = await page.$('span.step-links > a.right');
    if (!nextLink) {
      return null;
    }
    const href = await nextLink.evaluate((btn) => btn.href);
    return href || null;
  }

  /**
   * Opens each article of the current listing, saves it as a PDF and
   * navigates back to the listing.
   *
   * @param {ElementHandle[]} articles - handles of the listing currently shown
   * @returns {Promise<void>}
   */
  async processArticlesInPage(articles) {
    if (!articles || articles.length === 0) {
      return;
    }

    let currentHandles = articles;
    for (let index = 0; index < articles.length; index += 1) {
      // Handles obtained before a navigation should not be reused after
      // coming back to the listing, so fetch fresh ones for later articles.
      if (index > 0) {
        currentHandles = await this.getArticlesInPage();
      }
      const article = currentHandles[index];
      if (!article) {
        break;
      }

      const name = await this.getArticleName(article);
      console.log(`opening ${name}`);
      await this.openArticle(article);
      await this.#page.waitForNetworkIdle({});
      await this.savePdf(name);
      await this.#page.goBack({
        waitUntil: 'networkidle0',
      });
    }
  }
}