使用 Nightmare.js 爬取页面
2021-07-24

背景

此前我已有一个 scrapy 爬虫项目,一切看起来都挺美好的。能爬到动态加载的数据,用 scrapyd 管理爬虫任务,还自己写了一个创建爬虫任务的小工具。但是,写爬虫就是这样,矛再锐利,盾也不会差,不会出现其中某一个特别强另一个特别弱的情形。二者处在一个动态平衡的局面,可能现在是矛更厉害,过段时间盾一定会打回来的(反爬虫进化);也可能现在是盾更胜一筹,但过不了多久一定会有大神破解出来(变态如 12306 都能被破解)。所以,不要追求完美和一劳永逸,保持乐观和持续学习的心态就好了! 😄

先说问题吧:

  1. scrapy 是用 python 写的。python 虽然也是脚本语言,调整起来很快,但是和 nodejs 比起来,还是有差别的,毕竟后者用的是和页面一样的编程语言。解析页面数据、注入脚本、模拟用户行为等等,后者肯定是优于前者。
  2. scrapy 欠缺用户交互 API,如滚屏、模拟点击等等(也可能是我才疏学浅,没找到相应的 API)。而带有用户交互行为的爬虫,能够很大程度上降低被反爬虫机制封禁(🈲)的可能性。
  3. scrapy 调试时缺少浏览器窗口,无法快速编写爬虫程序。

看完这几个问题,大家应该清楚了,使用 nodejs 来写爬虫要优于 python。(当然,这仅仅是个人观点,不喜勿喷)

所以,今天的主角登场了:

Nightmare JS

为什么是它呢,而不是前端自动测试使用更广泛的 Selenium、PhantomJS 或者 Electron?emmm,因为用过。🙂

范例

声明:以下代码仅供学习使用,如果您坚持要用在生产环境或者商用,后果自负,本人概不负责!!!

搜狗搜索(片段):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
const Nightmare = require("nightmare");
const {
IS_PROD,
NIGHTMARE_TYPE_INTERVAL,
NIGHTMARE_USER_AGENT,
NIGHTMARE_VIEWPORT,
NIGHTMARE_GOTOTIMEOUT,
NIGHTMARE_LOADTIMEOUT,
PROXY_SERVER_HOST,
PROXY_SERVER_PORT,
PROXY_USER_NAME,
proxy_user_PASSWORD,
SOGOU_NEWS_HOST,
} = require("../constants");
const {
isNotEmptyString,
get3rdPartyShortName,
persistAdverseMedias
} = require("../utility");

/**
 * Full name of the third party, rebuilt from the command-line arguments.
 * The first argument is mandatory.
 */
let thirdPartyName = process.argv[2];

if (!isNotEmptyString(thirdPartyName)) {
  // Nothing to search for without a name.
  // NOTE(review): exit statuses are commonly limited to 0-255, so the caller
  // may not observe 404 verbatim - confirm what the invoking side expects.
  process.exit(404);
}

// A name that contains spaces may arrive split over several argv entries;
// append every following non-empty entry, separated by a single space.
let argvIdx = 3;
for (; isNotEmptyString(process.argv[argvIdx]); argvIdx += 1) {
  thirdPartyName = `${thirdPartyName} ${process.argv[argvIdx]}`;
}

/** Short name of the third party, as produced by get3rdPartyShortName. */
const thirdPartyName_short = get3rdPartyShortName(thirdPartyName);

/** Options handed to the Nightmare constructor. */
const nightmareOptions = {
  // Optional settings, currently switched off:
  //   switches: { "proxy-server": PROXY_SERVER_HOST + ":" + PROXY_SERVER_PORT },
  //   openDevTools: { mode: "detach" },
  show: !IS_PROD, // only show the browser window outside production
  typeInterval: NIGHTMARE_TYPE_INTERVAL,
  width: NIGHTMARE_VIEWPORT.width,
  height: NIGHTMARE_VIEWPORT.height,
  gotoTimeout: NIGHTMARE_GOTOTIMEOUT,
  loadTimeout: NIGHTMARE_LOADTIMEOUT
};

/** The single Nightmare instance used for the whole crawl, pagination included. */
const nm = Nightmare(nightmareOptions);

/** Search term typed into the site: the short name wrapped in double quotes. */
const searchTerm = `"${thirdPartyName_short}"`;

/**
 * Collect adverse-media records from the Sogou result page that is currently
 * loaded and append them to `resp.data`.
 *
 * The function is handed to Nightmare's `evaluate` by its callers, so it reads
 * the page through `document` and keeps its helpers nested inside its own body
 * instead of referencing anything from the enclosing module.
 *
 * @param {{data?: Object[], hasNextPage?: boolean}} resp accumulator carried
 *   from page to page; it is mutated and returned
 * @param {string} vendor third party name
 * @param {string} vendor_short third party short name
 * @param {string} EY_Search_Term search term used on the site
 * @returns {Object} `resp`, with the new records pushed onto `data` and
 *   `hasNextPage` refreshed
 */
const extractResults = (resp, vendor, vendor_short, EY_Search_Term) => {
  /** innerText of the first match below `root`, or "" when nothing matches. */
  const textOf = (root, selector) => {
    const node = root.querySelector(selector);
    return node ? node.innerText : "";
  };

  /** Absolute URL of the first matching anchor below `root`, or "" when absent. */
  const linkOf = (root, selector) => {
    const anchor = root.querySelector(selector);
    const href = anchor ? anchor.getAttribute("href") : null;
    if (!href) {
      return "";
    }
    try {
      // Resolves relative hrefs against the page and leaves absolute ones intact.
      return new URL(href, document.location.href).href;
    } catch (err) {
      // Unparsable href: fall back to plain concatenation.
      return document.location.origin + href;
    }
  };

  // Initialise first so that every return path hands back an array.
  resp.data = resp.data || [];

  const smartHint = document.getElementById("smart_hint_container");
  if (smartHint) {
    const smartHintText = smartHint.innerText.trim();
    // Accepts both the ASCII and the full-width (U+FF0C) comma.
    // NOTE(review): confirm which form the live page actually renders.
    if (
      /^抱歉[,\uFF0C]没有找到与/.test(smartHintText) &&
      smartHintText.includes("相关的网页。")
    ) {
      // The site reports that nothing matches the search term.
      resp.hasNextPage = false;
      return resp;
    }
  }

  if (resp.data.length > 256) {
    // More than 256 records collected already; stop paging.
    resp.hasNextPage = false;
    return resp;
  }

  console.log("before extracting:", resp.data.length);
  const items = document.querySelectorAll("#main .results .vrwrap");
  if (items && items.length) {
    console.log(items.length + " records were found...");
    // Identical for every record on this page.
    const myurl = document.location.href;
    const List_date = new Date().toISOString().slice(0, 10);
    for (let i = 0; i < items.length; i++) {
      const item = items[i];
      const obj = {
        vendor,
        vendor_short,
        List_date,
        EY_Search_Term,
        myurl,
        title: textOf(item, ".vr-title"),
        href: linkOf(item, ".vr-title a"),
        summary: textOf(item, ".star-wiki"),
        newsSite: textOf(item, ".news-from span"),
        newsDate: textOf(item, ".news-from span:last-child"),
        newsTime: "" // N/A
      };
      console.log("push item: ", JSON.stringify(obj));
      resp.data.push(obj);
    }
  }
  console.log("after extracting:", resp.data.length);

  // Scroll down to the paginator.
  const pagebar = document.getElementById("pagebar_container");
  if (pagebar) {
    pagebar.scrollIntoView({ behavior: "smooth", block: "end" });
  }
  resp.hasNextPage = !!document.getElementById("sogou_next");
  return resp;
};
/**
 * Build the `then` handler that walks the remaining Sogou result pages.
 *
 * The returned handler receives the accumulator produced by `extractResults`.
 * While `resp.hasNextPage` is truthy it clicks the "next" link, waits for the
 * result container, runs `extractResults` in the page again and chains itself
 * onto the outcome; otherwise it hands `resp` back unchanged.
 *
 * @param {Object} nm Nightmare instance already used for the first page
 * @returns {function(Object): (Object|Promise<Object>)} handler for a promise chain
 */
const crawlNexPage = nm => resp => {
  if (!resp.hasNextPage) {
    return resp;
  }
  console.log("crawling next page...");
  return nm
    .click("#sogou_next")
    .wait(2048) // fixed pause before checking for the result container
    .wait("#main")
    .evaluate(
      extractResults,
      resp,
      thirdPartyName,
      thirdPartyName_short,
      searchTerm
    )
    .then(crawlNexPage(nm));
};

console.log("start crawling adverse media for " + searchTerm);

// Main flow: open the Sogou news search, submit the term, harvest the first
// page, follow the pagination, then persist everything that was collected.
nm.useragent(NIGHTMARE_USER_AGENT)
  // Proxy credentials, currently disabled:
  //.authentication(PROXY_USER_NAME, proxy_user_PASSWORD)
  .goto(SOGOU_NEWS_HOST)
  .wait(4096) // fixed pause after navigation
  .type("#query", searchTerm)
  .click("#searchBtn")
  .wait(2048)
  .wait("#main")
  .evaluate(
    extractResults,
    { data: [], hasNextPage: true },
    thirdPartyName,
    thirdPartyName_short,
    searchTerm
  )
  .then(crawlNexPage(nm))
  .then(response => {
    // persistent
    console.log("persistent, amount: ", response.data.length);
    return persistAdverseMedias(response.data);
  })
  .catch(err => {
    // Without this handler a rejection anywhere above would vanish, because
    // the finally handler terminates the process right away.
    console.error(err);
  })
  .finally(() => {
    // NOTE(review): the exit status is 0 even after a failure - confirm
    // whether the invoking side needs a non-zero status in that case.
    console.log("Done, exiting...");
    process.exit(0);
  });

头条新闻搜索(片段):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
const Nightmare = require("nightmare");
const {
IS_PROD,
NIGHTMARE_TYPE_INTERVAL,
NIGHTMARE_USER_AGENT,
NIGHTMARE_VIEWPORT,
NIGHTMARE_GOTOTIMEOUT,
NIGHTMARE_LOADTIMEOUT
} = require("../constants");
const {
isNotEmptyString,
get3rdPartyShortName,
persistAdverseMedias
} = require("../utility");

/**
 * Full name of the third party, rebuilt from the command-line arguments.
 * The first argument is mandatory.
 */
let thirdPartyName = process.argv[2];

if (!isNotEmptyString(thirdPartyName)) {
  // Nothing to search for without a name.
  // NOTE(review): exit statuses are commonly limited to 0-255, so the caller
  // may not observe 404 verbatim - confirm what the invoking side expects.
  process.exit(404);
}

// A name that contains spaces may arrive split over several argv entries;
// append every following non-empty entry, separated by a single space.
let argvIdx = 3;
for (; isNotEmptyString(process.argv[argvIdx]); argvIdx += 1) {
  thirdPartyName = `${thirdPartyName} ${process.argv[argvIdx]}`;
}

/** Short name of the third party, as produced by get3rdPartyShortName. */
const thirdPartyName_short = get3rdPartyShortName(thirdPartyName);

/** Options handed to the Nightmare constructor. */
const nightmareOptions = {
  // Optional settings, currently switched off:
  //   switches: { "proxy-server": PROXY_SERVER_HOST + ":" + PROXY_SERVER_PORT },
  //   openDevTools: { mode: "detach" },
  show: !IS_PROD, // only show the browser window outside production
  typeInterval: NIGHTMARE_TYPE_INTERVAL,
  width: NIGHTMARE_VIEWPORT.width,
  height: NIGHTMARE_VIEWPORT.height,
  gotoTimeout: NIGHTMARE_GOTOTIMEOUT,
  loadTimeout: NIGHTMARE_LOADTIMEOUT
};

/** The single Nightmare instance used for the whole crawl, pagination included. */
const nm = Nightmare(nightmareOptions);

/**
 * Search term typed into the site: the full third party name.
 * (The quoted short name, '"' + thirdPartyName_short + '"', is the disabled alternative.)
 */
const searchTerm = thirdPartyName;

/**
 * Toutiao search, news ("information") channel, used as the landing page.
 * NOTE(review): the keyword in this URL looks like a placeholder, since the
 * real term is typed into the search box afterwards - confirm.
 */
const homePage =
  "https://so.toutiao.com/search?keyword=baidu.com&pd=information&source=input&dvpf=pc&aid=4916&page_num=0";

/**
 * Collect adverse-media records from the Toutiao result page that is currently
 * loaded and append them to `resp.data`.
 *
 * The function is handed to Nightmare's `evaluate` by its callers, so it reads
 * the page through `document` and keeps its helpers nested inside its own body
 * instead of referencing anything from the enclosing module.
 *
 * @param {{data?: Object[], hasNextPage?: boolean}} resp accumulator carried
 *   from page to page; it is mutated and returned
 * @param {string} vendor third party name
 * @param {string} vendor_short third party short name
 * @param {string} EY_Search_Term search term used on the site
 * @returns {Object} `resp`, with the new records pushed onto `data` and
 *   `hasNextPage` set to a strict boolean
 */
const extractResults = (resp, vendor, vendor_short, EY_Search_Term) => {
  /** innerText of `node`, or "" when the node is missing. */
  const textOf = node => (node ? node.innerText : "");

  /** Absolute URL for the anchor's href, or "" when anchor or href is missing. */
  const linkOf = anchor => {
    const href = anchor ? anchor.getAttribute("href") : null;
    if (!href) {
      return "";
    }
    try {
      // Resolves relative hrefs against the page and leaves absolute ones intact.
      return new URL(href, document.location.href).href;
    } catch (err) {
      // Unparsable href: fall back to plain concatenation.
      return document.location.origin + href;
    }
  };

  console.log("extracting…");
  resp.data = resp.data || [];
  if (resp.data.length > 256) {
    // More than 256 records collected already; stop paging.
    resp.hasNextPage = false;
    return resp;
  }

  console.log("before extracting:", resp.data.length);
  const items = document.querySelectorAll(".main .result-content[data-i]");
  if (items && items.length) {
    console.log(items.length + " records were found...");
    // Identical for every record on this page.
    const myurl = document.location.href;
    const List_date = new Date().toISOString().slice(0, 10);
    for (let i = 0; i < items.length; i++) {
      const item = items[i];
      const link = item.querySelector("a");
      const obj = {
        vendor,
        vendor_short,
        List_date,
        EY_Search_Term,
        myurl,
        title: textOf(link),
        href: linkOf(link),
        summary: textOf(item.querySelector(".text-underline-hover")),
        newsSite: textOf(item.querySelector(".cs-source-content span")),
        newsDate: textOf(
          item.querySelector(".cs-source-content>span:last-child")
        ),
        newsTime: "" // N/A
      };
      console.log("push item: ", JSON.stringify(obj));
      resp.data.push(obj);
    }
  }
  console.log("after extracting:", resp.data.length);

  // Scroll down to the paginator.
  const paginator = document.querySelector(".cs-pagination");
  if (paginator) {
    paginator.scrollIntoView({ behavior: "smooth", block: "end" });
  }
  // A next page exists when the paginator's last link carries "cs-button-mb".
  const lastPageLink = paginator
    ? paginator.querySelector("a:last-child")
    : null;
  resp.hasNextPage =
    Boolean(lastPageLink) && lastPageLink.classList.contains("cs-button-mb");
  return resp;
};
/**
 * Build the `then` handler that walks the remaining Toutiao result pages.
 *
 * The returned handler receives the accumulator produced by `extractResults`.
 * While `resp.hasNextPage` is truthy it clicks the paginator's last link, waits
 * for the result list, runs `extractResults` in the page again and chains
 * itself onto the outcome; otherwise it hands `resp` back unchanged and logs
 * nothing.
 *
 * @param {Object} nm Nightmare instance already used for the first page
 * @returns {function(Object): (Object|Promise<Object>)} handler for a promise chain
 */
const crawlNexPage = nm => resp => {
  if (!resp.hasNextPage) {
    return resp;
  }
  console.log("crawling next page...");
  return nm
    .click(".cs-pagination a:last-child")
    .wait(2048) // fixed pause before checking for the result list
    .wait(".main .s-result-list")
    .evaluate(
      extractResults,
      resp,
      thirdPartyName,
      thirdPartyName_short,
      searchTerm
    )
    .then(crawlNexPage(nm));
};

/** User agent sent to this site; NIGHTMARE_USER_AGENT is the disabled alternative. */
const toutiaoUserAgent =
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:90.0) Gecko/20100101 Firefox/90.0";
/** Selector of the search box on the landing page. */
const searchInput = 'input[type="search"]';
/** Selector of the container that holds the search results. */
const resultList = ".main .s-result-list";

/** Store everything that was collected. */
const persistCollected = response => {
  console.log(
    "trying persistent adverse medias, amount: ",
    response.data.length
  );
  return persistAdverseMedias(response.data);
};

/** Report a failure from any earlier step of the chain. */
const reportFailure = err => {
  console.error(err);
  console.log(JSON.stringify(err));
};

/** Terminate the process once the chain has settled, whatever the outcome. */
const quit = () => {
  console.log("Done, exiting...");
  process.exit(0);
};

console.log("start crawling adverse media for " + searchTerm);

// Main flow: open the landing page, replace the search box content with the
// term, submit, harvest the first page, follow the pagination, then persist.
nm.useragent(toutiaoUserAgent)
  // Proxy credentials, currently disabled:
  //.authentication(PROXY_USER_NAME, proxy_user_PASSWORD)
  .goto(homePage)
  .wait(4096)
  .wait(searchInput)
  .type(searchInput, "") // clear the box first
  .type(searchInput, searchTerm)
  // Submitting by typing "\u000d" (enter key) is the disabled alternative to the click below.
  .click(".search_1sPyO_")
  .wait(4096)
  .wait(resultList)
  .evaluate(
    extractResults,
    { data: [], hasNextPage: true },
    thirdPartyName,
    thirdPartyName_short,
    searchTerm
  )
  .then(crawlNexPage(nm))
  .then(persistCollected)
  .catch(reportFailure)
  .finally(quit);

最佳实践

  1. evaluate 其实是把方法体注入到页面里执行,所以,假如你的 evaluate 方法体直接引用了外部(Node 端)变量,那肯定是不行的。解决办法也很简单:像上面的范例那样,把外部变量作为 evaluate 的额外参数传入(.evaluate(fn, arg1, arg2, …)),或者使用立即执行函数(IIFE,Immediately invoked function expression)传入外部变量即可。
  2. evaluate 记得 catch,否则很容易中断执行。
  3. 记得设置 loadTimeout,因为默认是 infinite!如果某些网站(你懂的)一直不报错,也不返回结果,或者返回结果超级慢,那你的这个爬虫程序就卡在这里了。
  4. 翻页的话,使用之前的 nightmare 实例就行。

参考链接

本文链接:
https://zxs66.github.io/2021/07/24/crawling-web-via-Nightmare-js/