【实战】Ozon产品列表页及产品详情页nodejs爬虫

      
      
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
      
      
{
"name": "1",
"version": "1.0.0",
"description": "",
"main": "index.js",
"dependencies": {
"fs": "^0.0.1-security",
"https": "^1.0.0",
"iconv-lite": "^0.4.21",
"jsdom": "^11.7.0"
},
"devDependencies": {},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC"
}

代码

      
      
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
      
      
// config/index.js
module.exports = {
options: {
hostname: 'www.ozon.ru',
port: 443,
path: '/context/detail/id/144054492/',
method: 'GET',
headers: {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
'if-modified-since': 'Fri, 08 Jun 2018 03:42:08 GMT',
'referer': 'https://www.ozon.ru/catalog/1133763/?type=48856',
'upgrade-insecure-requests': 1,
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36'
}
},
baseURL: 'https://www.ozon.ru',
timeout: 3000
}
// tool/generateExcel.js
// Turns the crawled items stored in tmp/mid_output.json into rows appended to
// output/output.csv.
// fs: Node.js file system module
var fs = require( 'fs')
// Load the crawled items from the intermediate JSON file (the same path that
// setOutput in tool/get.js writes to).
var datas = require( '../tmp/mid_output.json')
// Used as a set: every distinct params key seen across the items is stored as
// a property here and later becomes one CSV column.
var culs = new Object()
/**
 * Makes a value safe to place in one cell of the comma-separated output.
 *
 * - `undefined` and `null` yield `null`; the callers' string concatenation
 *   renders that as the text "null".
 * - Line breaks are replaced by ';' so that one record stays on one line.
 * - A value containing a comma or a double quote is wrapped in double quotes
 *   with embedded quotes doubled (RFC 4180 style), so it is not split into
 *   several columns.
 *
 * @param {*} str - raw cell value (normally a string)
 * @returns {?string} the cell text, or null when there is no value
 */
var prepare = str => {
  if (str === undefined || str === null) {
    return null
  }
  // The previous pattern /n/g matched the letter "n" rather than a line break,
  // and the comma replacement substituted a comma for a comma.
  var text = String(str).replace(/\r\n|\r|\n/g, ';')
  if (/[",]/.test(text)) {
    return '"' + text.replace(/"/g, '""') + '"'
  }
  return text
}
// Remove, in place, every item whose params is not an array (for example an
// item whose detail page was never parsed), because the flattening step below
// iterates over item.params.
// The array is walked from the end so that splice() never shifts an element
// that has not been examined yet; a forward loop skips the element following
// each removal.
for (var i = datas.length - 1; i >= 0; i--) {
  if (!Array.isArray(datas[i].params)) {
    datas.splice(i, 1)
  }
}
// Flatten each item's params list into plain properties on the item itself and
// record every key that was seen, so it can become a CSV column later.
for (var record of datas) {
  for (var entry of record.params) {
    var column = entry.key
    // Register the key in culs the first time it is encountered.
    if (culs[column] === undefined) {
      culs[column] = true
    }
    // params[i] = { key, value } becomes record[key] = value.
    record[column] = entry.value
  }
  // The list itself is no longer needed once its entries are copied.
  delete record.params
}
// console.log(datas[0].prototype === datas[1].prototype)
// console.log(culs)
// console.log(datas)
// Header row: the fixed columns first, then one column per collected params key.
var columnsName = 'number,href,img,name,price,cnum'
for (var key in culs) {
  columnsName += ',' + prepare(key)
}
// '\n' terminates the record (the previous code appended the letter "n").
columnsName += '\n'
// writeFileSync accepts no callback and throws on failure, so no error handler
// is passed. flag 'a' appends to an existing file instead of replacing it.
fs.writeFileSync(__dirname + '/../output/output.csv', columnsName, { flag: 'a' })
// Running row number written into the first column; starts at 1.
var cnt = 1
for (var data of datas) {
  var str = ''
  str += cnt++
  str += ',' + prepare(data.href)
  str += ',' + prepare(data.img)
  str += ',' + prepare(data.name)
  str += ',' + prepare(data.price)
  str += ',' + prepare(data.cnum)
  // Same key order as the header row, so every value lands in its own column.
  for (var key in culs) {
    str += ',' + prepare(data[key])
  }
  str += '\n'
  // Append this record to the end of output.csv.
  fs.writeFileSync(__dirname + '/../output/output.csv', str, { flag: 'a' })
}
// cnt starts at 1, so this prints the number of rows written plus one.
console.log(cnt)
// tool/get.js
// Reads the saved product-list HTML from input/input.txt, requests the detail
// page of each product and writes the collected records to tmp/mid_output.json.
const https = require( 'https')
const fs = require( 'fs')
// iconv-lite decodes the response bytes (getDetail decodes them as 'win1251').
const iconv = require( 'iconv-lite')
// jsdom parses HTML text into a DOM that can be queried by class name.
const jsdom = require( 'jsdom')
const { JSDOM } = jsdom
// Request options, base URL and request delay (see config/index.js).
var config = require( '../config')
// Product records: filled by getInput, extended with params by getDetail and
// serialised by setOutput.
var items = new Array()
/**
 * Parses the product-list HTML saved in input/input.txt and appends one record
 * per product tile (elements with classes "bOneTile inline") to the
 * module-level `items` array.
 *
 * Each record holds:
 *   href  - href of the tile's first "eOneTile_link" element
 *   img   - data-image-src of the first "eOneTile_image_link" element, or null
 *           when the tile has no such element
 *   name  - the tile's data-name attribute
 *   price - the tile's data-price attribute, or the string 'null' when absent
 *   cnum  - innerHTML of the first "eOneTile_ReviewsCount" element, or '0'
 *
 * A tile without a link element is skipped with a warning, since there would
 * be no detail page to request for it.
 */
var getInput = () => {
  var html = fs.readFileSync(__dirname + '/../input/input.txt')
  const dom = new JSDOM(html.toString())
  var tiles = dom.window.document.getElementsByClassName('bOneTile inline')
  for (var tile of tiles) {
    var link = tile.getElementsByClassName('eOneTile_link')[0]
    if (link === undefined) {
      console.warn('getInput: skipping a product tile without an eOneTile_link element')
      continue
    }
    var image = tile.getElementsByClassName('eOneTile_image_link')[0]
    var price = tile.getAttribute('data-price')
    var reviews = tile.getElementsByClassName('eOneTile_ReviewsCount')[0]
    items.push({
      href: link.href,
      img: image === undefined ? null : image.getAttribute('data-image-src'),
      name: tile.getAttribute('data-name'),
      // getAttribute reports a missing attribute as null, not undefined, so the
      // previous `!== undefined` check could never select the fallback.
      price: price === null || price === undefined ? 'null' : price,
      cnum: reviews === undefined ? '0' : reviews.innerHTML
    })
  }
}
/**
 * Requests the detail page of items[idx] and stores the product's property
 * table on that item as `params`, an array of { key, value } pairs taken from
 * the elements with class "eItemProperties_line".
 *
 * The call returns immediately; `params` is assigned only once the response
 * has ended. If the request fails, the error is logged and the item keeps no
 * `params` property.
 *
 * Side effects on the shared config: `options.path` is set to the item's href
 * before the request, and `options.headers.referer` is set to this item's URL
 * after the request has been issued.
 *
 * @param {number} idx - index into the module-level `items` array
 */
var getDetail = idx => {
  var item = items[idx]
  config.options.path = item.href
  const req = https.get(config.options, res => {
    var chunks = []
    var size = 0
    res.on('data', chunk => {
      chunks.push(chunk)
      size += chunk.length
    })
    res.on('end', () => {
      // The body is decoded as windows-1251 before it is parsed.
      var html = iconv.decode(Buffer.concat(chunks, size), 'win1251')
      const dom = new JSDOM(html)
      var lines = dom.window.document.getElementsByClassName('eItemProperties_line')
      item.params = new Array()
      for (var line of lines) {
        // Child 1 is read as the property name and child 3 as its value; a
        // line without both is skipped instead of throwing.
        var keyNode = line.childNodes[1]
        var valueNode = line.childNodes[3]
        if (keyNode === undefined || valueNode === undefined) {
          continue
        }
        item.params.push({
          key: keyNode.innerHTML,
          value: valueNode.innerHTML
        })
      }
    })
  })
  // Without a listener, an 'error' event on the request (DNS failure, reset
  // connection, ...) is thrown and terminates the whole crawl.
  req.on('error', err => {
    console.error('getDetail: request for ' + item.href + ' failed: ' + err.message)
  })
  // https.get() ends the request itself, so no explicit req.end() is needed.
  // Update the referer to this product's URL to imitate a browser.
  config.options.headers.referer = config.baseURL + item.href
}
/**
 * Builds one step of the detail crawl over items[idx .. end-1].
 *
 * The returned function, when called, either
 *   - logs idx, starts getDetail(idx) and schedules the step for idx + 1 after
 *     config.timeout, or
 *   - calls setOutput() once idx is no longer below end.
 *
 * @param {number} idx - index of the item this step handles
 * @param {number} end - exclusive upper bound of the crawl
 * @returns {function(): void} the step to invoke
 */
var getAllDetail = (idx, end) => {
  return () => {
    if (!(idx < end)) {
      // Every index has been handled: persist what was collected.
      setOutput()
      return
    }
    console.log(idx)
    getDetail(idx)
    const nextStep = getAllDetail(idx + 1, end)
    setTimeout(nextStep, config.timeout)
  }
}
/**
 * Serialises the module-level `items` array as JSON and writes it
 * asynchronously to tmp/mid_output.json (relative to this script's parent
 * directory). The error argument of the completion callback is logged on every
 * completion.
 */
var setOutput = () => {
  const target = __dirname + '/../tmp/mid_output.json'
  const payload = JSON.stringify(items)
  const report = err => console.log(err)
  fs.writeFile(target, payload, report)
}
// Entry point. The order matters: items must be filled before anything else.
// 1. Parse the saved list page into `items`.
getInput()
// 2. Write the list-page data right away, before any detail request is made.
setOutput()
// 3. Start the detail crawl at index 0; getAllDetail returns the first step,
//    which is invoked immediately and calls setOutput again when it finishes.
getAllDetail( 0, items.length)()

使用说明

1.打开Ozon产品列表

2.按F12,选择列表div,右键copy->copy element

3.打开input.txt,将数据删除后粘贴新的数据

4.Ozon.ru点击页面左上角安全Cookies全部禁用,点击一个产品刷新,F12点击Network找到第一个(数字串),复制request headers

5.粘贴到config/index.js的headers中,对比上下修改

6.运行 node tool/get.js 和 node tool/generateExcel.js

7.注意完成后将页面的cookies屏蔽解除

原文:大专栏  【实战】Ozon产品列表页及产品详情页nodejs爬虫


猜你喜欢

转载自www.cnblogs.com/petewell/p/11601733.html