Node.js Child Processes
2021-12-30

A while back, I wrote a few crawler programs (for personal learning only!!!). To avoid being flagged by anti-crawler systems, I deliberately ran them serially, one at a time, and even throttled the crawl speed. But with multiple target sites, there is no need to box myself in like that.

My plan (the requirement): given a search term, dispatch it to crawlers for several search engines (say Baidu, Google, Bing) and have them all search at the same time.

So, let's get to it.
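Before the full script, the core building block is `child_process.spawn`: it starts a new OS process and emits a `close` event with the child's exit code, which wraps naturally in a Promise. A minimal sketch of that pattern (the script path and search term below are placeholders):

const { spawn } = require("child_process");

// start a worker script and resolve with its exit code
const runScript = (scriptPath, arg) =>
  new Promise(resolve => {
    const child = spawn("node", [scriptPath, arg]);
    child.on("close", exitCode => resolve(exitCode));
  });

runScript("./crawlers/sogouCrawler.js", "some search term")
  .then(code => console.log("child exited with code", code));

The full script below builds on exactly this pattern.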

const fs = require("fs");
const path = require("path");
const spawnProcess = require("child_process");
const { exit } = require("process");

const { REQUEST_DELAY } = require("./constants");

const jobsFolder = "./jobs";

/** send the search job to sogouCrawler */
const invokeSogouCrawler = searchTerm => {
  return new Promise((resolve, reject) => {
    console.log("start crawling sogou news for " + searchTerm);
    setTimeout(() => {
      const p = spawnProcess.spawn("node", [
        "./crawlers/sogouCrawler.js",
        searchTerm
      ]);
      p.on("close", function(spawnCode) {
        console.log("complete crawling sogou news for " + searchTerm);
        resolve(spawnCode);
      });
      setTimeout(resolve, 1048576); // bugfix: otherwise the program would just get stuck here
    }, REQUEST_DELAY);
  });
};

/** send the search job to toutiaoCrawler */
const invokeToutiaoCrawler = searchTerm => {
  return new Promise((resolve, reject) => {
    console.log("start crawling toutiao news for " + searchTerm);
    setTimeout(() => {
      const p = spawnProcess.spawn("node", [
        "./crawlers/toutiaoCrawler.js",
        searchTerm
      ]);
      p.on("close", function(spawnCode) {
        console.log("complete crawling toutiao news for " + searchTerm);
        resolve(spawnCode);
      });
      setTimeout(resolve, 1048576); // bugfix: otherwise the program would just get stuck here
    }, REQUEST_DELAY);
  });
};
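The two invokers differ only in the script path and the log label, so they could come from a single factory. A possible refactor sketch (makeCrawlerInvoker is a name introduced here, not part of the original script); it also keeps a handle on the fallback timer so it can be cancelled once the child exits normally:

// hypothetical helper: build an invoker for any crawler script
const makeCrawlerInvoker = (scriptPath, label) => searchTerm =>
  new Promise(resolve => {
    console.log(`start crawling ${label} news for ${searchTerm}`);
    setTimeout(() => {
      const p = spawnProcess.spawn("node", [scriptPath, searchTerm]);
      // safety net: resolve anyway if the child hangs (1048576 ms ≈ 17.5 minutes)
      const timer = setTimeout(resolve, 1048576);
      p.on("close", spawnCode => {
        clearTimeout(timer);
        console.log(`complete crawling ${label} news for ${searchTerm}`);
        resolve(spawnCode);
      });
    }, REQUEST_DELAY);
  });

// with it, the two functions above collapse to:
// const invokeSogouCrawler = makeCrawlerInvoker("./crawlers/sogouCrawler.js", "sogou");
// const invokeToutiaoCrawler = makeCrawlerInvoker("./crawlers/toutiaoCrawler.js", "toutiao");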

// get job list
fs.readdir(jobsFolder, (err, files) => {
  if (err) {
    console.error(err);
    exit();
  }
  if (files == null || files.length === 0) {
    console.warn("no job file was found!");
    exit();
  }
  // get the latest job file
  let theLatestFile = path.join(jobsFolder, files[0]);
  let theLatestMTime = new Date();
  for (let file of files) {
    const filePath = path.join(jobsFolder, file);
    const fileMTime = fs.statSync(filePath).mtime;
    if (theLatestFile === filePath) {
      theLatestMTime = fileMTime;
      continue;
    }
    if (fileMTime > theLatestMTime) {
      theLatestFile = filePath;
      theLatestMTime = fileMTime;
    }
  }
  // read content from the latest job file
  fs.readFile(theLatestFile, "utf8", (err, content) => {
    if (err) {
      console.error(err);
      exit();
    }
    if (content && content.length) {
      const rows = content
        .split("\n")
        .filter(_ => _ && _.length)
        .map(_ => _.trim());
      // split each row on tabs, flatten, drop entries shorter than 5 chars, dedupe
      const tprs = Array.from(
        new Set(
          rows
            .map(_ => _.split("\t"))
            .reduce((prev, curr) => {
              return prev.concat(curr);
            }, [])
            .filter(_ => _.length >= 5)
        )
      );
      if (tprs.length) {
        console.log(tprs);
        const allDoneSignal = "Winner, Winner, Chicken Dinner~";
        /** invoke the sogou crawler one term at a time */
        const nextSogouCrawlItem = (idx_sogou = 0) => {
          if (idx_sogou === tprs.length) {
            return Promise.resolve(allDoneSignal);
          }
          const p = invokeSogouCrawler(tprs[idx_sogou]);
          return p.then(() => {
            idx_sogou++;
            return nextSogouCrawlItem(idx_sogou);
          });
        };
        /** invoke the toutiao crawler one term at a time */
        const nextToutiaoCrawlItem = (idx_toutiao = 0) => {
          if (idx_toutiao === tprs.length) {
            return Promise.resolve(allDoneSignal);
          }
          const p = invokeToutiaoCrawler(tprs[idx_toutiao]);
          return p.then(() => {
            idx_toutiao++;
            return nextToutiaoCrawlItem(idx_toutiao);
          });
        };
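        // each engine works through its term list serially (every call waits
        // for the previous Promise), while the two engines run in parallel;
        // Promise.all below resolves once both chains have finished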
        let p_sogou = nextSogouCrawlItem();
        let p_toutiao = nextToutiaoCrawlItem();
        // let p_toutiao = Promise.resolve(allDoneSignal); // temporarily disabled
        Promise.all([p_sogou, p_toutiao]).then(results => {
          // every entry in results should equal allDoneSignal
          console.log(results);
          exit();
        });
      }
    } else {
      console.warn("empty job file: ", theLatestFile);
      exit();
    }
  });
});
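For reference, the parsing above implies the job file layout: one row per line, with tab-separated search terms inside a row; entries shorter than 5 characters are dropped and duplicates removed. A tiny self-contained demo of the same pipeline (the sample terms are made up):

// demo of the parsing pipeline on made-up job file content
const content = "nodejs child process\tspawn vs fork\nnodejs child process\tabc\n";
const rows = content
  .split("\n")
  .filter(_ => _ && _.length)
  .map(_ => _.trim());
const terms = Array.from(
  new Set(
    rows
      .map(_ => _.split("\t"))
      .reduce((prev, curr) => prev.concat(curr), [])
      .filter(_ => _.length >= 5)
  )
);
console.log(terms); // [ 'nodejs child process', 'spawn vs fork' ]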

Permalink: https://zxs66.github.io/2021/12/30/Node-js-child-processes/