import requests
import urllib3
from fake_useragent import UserAgent
from lxml import etree
import csv
import time
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def get_html(page, keyword, header, timeout=10):
    """Download one page of 51job search results.

    Args:
        page: Page number substituted into the search URL.
        keyword: Search term substituted into the search URL.
        header: Dict of HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Optional;
            without it a stalled connection would block forever.

    Returns:
        The response body as text, decoded with the encoding detected from
        the content, or ``None`` when the request fails or the status code
        is not 200.
    """
    url = f'https://search.51job.com/list/010000,000000,0000,00,9,99,{keyword},2,{page}.html?'
    try:
        # verify=False: TLS certificate verification is disabled for this
        # request; the matching urllib3 warning is silenced at import time.
        response = requests.get(url, verify=False, headers=header, timeout=timeout)
    except requests.RequestException as exc:
        # Report the failure and let the caller decide what to do with None
        # instead of aborting the whole crawl on one bad page.
        print(f'第{page}页请求失败: {exc}')
        return None
    if response.status_code != 200:
        return None
    # Use the encoding guessed from the body rather than the header value.
    response.encoding = response.apparent_encoding
    return response.text
def _first_text(nodes, default=''):
    """Return the first XPath text result stripped, or ``default`` if empty."""
    return nodes[0].strip() if nodes else default


def parse_html(html):
    """Extract job listings from a search-result page.

    Args:
        html: Page source as a string. ``None`` or an empty string yields an
            empty result instead of raising.

    Returns:
        A list of ``[title, company, region, salary, date]`` lists, one for
        each listing row that has a job title. A missing salary is recorded
        as ``'面议'``; any other missing field is recorded as ``''``.
    """
    if not html:
        return []
    data = etree.HTML(html)
    if data is None:
        # Nothing parseable came back; treat it as a page with no rows.
        return []
    datas = []
    for info in data.xpath("//div[@class='dw_table']/div[@class='el']"):
        name = info.xpath('p//a/text()')  # job title
        if not name:
            # Rows without a title link are not job listings; skip them.
            continue
        row = [
            _first_text(name),
            _first_text(info.xpath('span/a/text()')),  # company name
            _first_text(info.xpath('span[@class="t3"]/text()')),  # region
            _first_text(info.xpath('span[@class="t4"]/text()'), '面议'),  # salary
            _first_text(info.xpath('span[@class="t5"]/text()')),  # posting date
        ]
        # Echo exactly what is stored, so the console matches the CSV.
        print(*row)
        datas.append(row)
    return datas
def save_data(data, filename='python'):
    """Append rows to ``<filename>.csv``.

    Args:
        data: Iterable of rows, each an iterable of field values.
        filename: Output path without the ``.csv`` extension.

    The file is opened in append mode as UTF-8, so repeated calls accumulate
    rows in the same file. It is created even when ``data`` is empty.
    """
    # newline='' lets the csv module control line endings itself.
    with open(f'{filename}.csv', 'a', encoding='utf-8', newline='') as f:
        csv.writer(f).writerows(data)
def main():
    """Prompt for a keyword and page count, then scrape each page to a CSV.

    Each page is fetched with ``get_html``, parsed with ``parse_html`` and
    appended to ``<keyword>.csv`` with ``save_data``. Pages that cannot be
    fetched are reported and skipped.
    """
    ua = UserAgent()
    header = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Host": "search.51job.com",
        # NOTE(review): this Referer points at an image on a different site,
        # not at a 51job page; it looks like a copy/paste artifact. Confirm
        # the intended value before relying on it.
        "Referer": "http://www.51sjk.com/Upload/Articles/1/0/263/263930_20210708001823847.jpg",
        "User-Agent": ua.random
    }
    keyword = input('请输入您要查询的岗位:')
    try:
        page = int(input('请输入要获取的页数:'))
    except ValueError:
        # Exit with a readable message instead of a traceback.
        raise SystemExit('页数必须是整数') from None
    for i in range(1, page + 1):
        print(f'开始爬取第{i}页')
        html = get_html(i, keyword, header)
        time.sleep(1.1)  # pause between consecutive requests
        if not html:
            # get_html returns None on a non-200 response; nothing to parse.
            print(f'第{i}页获取失败,已跳过')
            continue
        datas = parse_html(html)
        save_data(datas, keyword)


if __name__ == '__main__':
    main()
XHTML|
HTML5|
CSS|
HTML DOM|
jQuery|
JSON|
AJAX|
LESS|
HTML|
Bootstrap|
Foundation|
AngularJS|
Ember.js|
TypeScript|
AngularJS2|
React|
jQuery UI|
jQuery EasyUI|
Node.js|
Highcharts|
Echarts|
Vue.js|
CoffeeScript|
Ext.js|
Meteor|
SASS|
Omi|
Markdown|
前端开发规范|
浏览器|
webpack|
JavaScript|
CSS3|
用户登录
还没有账号?立即注册
用户注册
投稿取消
文章分类: |
|
还能输入300字
上传中....