Preparation
In the previous post, we looked at how to fetch movie resources and how to extract the information we wanted by scraping static pages. This time, we will fetch job postings through a dynamic API instead!
The local runtime environment is again Docker-based; for the setup details, see the previous post ==> link.
Writing the Code
Requirements Analysis
Opening the Tencent Careers page and inspecting the page source, we find that the data we want is not in the HTML at all. Checking the network requests shows the listings are loaded from a JSON API, so this time we will pull the information straight from that API.
- Work out how the list-API url changes from page to page when searching by keyword on the home page.
- Inspect the fields in the list API's paginated response and pick out the ones we need (see the probe sketch after this list).
- Inspect the detail API's response and pick out the fields we need.
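Before writing the scraper proper, it can help to probe one page of the list API and print the shape of the JSON. This is a minimal sketch: the query parameters are copied from the browser's network tab, and the keys shown in the comments are what the endpoint returned at the time of writing, so verify them yourself.

```python
import json
import requests

# Probe one page of the list API and inspect the JSON structure.
# The query string below is copied from the browser's network tab.
url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
       '?pageIndex=1&pageSize=10&language=zh-cn&area=cn')
response = requests.get(url, timeout=10)
json_obj = json.loads(response.text)

print(json_obj.keys())                       # e.g. dict_keys(['Code', 'Data'])
print(json_obj['Data']['Posts'][0].keys())   # fields available per job posting
```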
Implementation
- First, define the entry loop, which walks the result pages and extracts the key information from each one.
```python
for page_num in range(1, 2):
    print('Scraping page {}...'.format(page_num))
    # 1. The list-API URL for this page
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1625731961957&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'.format(page_num)
    # 2. Collect the post IDs of every job on the current page
    detail_urls = get_jo_detail_urls(url)
    # 3. Fetch and parse each job's detail data, one by one
    for detail_url in detail_urls:
        position = get_detail_msg(detail_url)
        positions.append(position)
        time.sleep(1)
```
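The loop above hard-codes the page range. To walk every page instead, the total number of postings can be read from the list response; in the captures used here that count sits in Data.Count, but treat the field name as an assumption and confirm it against a live response. A sketch:

```python
import math

def total_pages(json_obj, page_size=10):
    # Data.Count is assumed to be the total number of postings reported
    # by the list API; verify the field name in your own network tab.
    total = json_obj['Data']['Count']
    return math.ceil(total / page_size)
```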
- Next, write the two helpers: one collects the post IDs from a list page, and the other builds the detail-API URL from a post ID and extracts the fields from its response.
```python
def get_detail_msg(detail_id):
    position = {}
    detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1625794375072&postId={}&language=zh-cn'.format(detail_id)
    # print('Requesting detail URL: ' + detail_url)
    response = requests.get(detail_url, headers=HEADERS)
    json_obj = json.loads(response.text)
    # [Data] job title
    position['title'] = json_obj['Data']['RecruitPostName']
    # [Data] work location / job category
    position['location'] = json_obj['Data']['LocationName']
    position['category'] = json_obj['Data']['CategoryName']
    # [Data] responsibilities
    position['duty'] = json_obj['Data']['Responsibility']
    # [Data] requirements
    position['ask'] = json_obj['Data']['Requirement']
    return position


def get_jo_detail_urls(page_url):
    # Collect the PostId of every job on this page; a set removes duplicates
    post_ids = set()
    response = requests.get(page_url, headers=HEADERS)
    json_obj = json.loads(response.text)
    for item in json_obj['Data']['Posts']:
        post_ids.add(item['PostId'])
    return post_ids
```
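Both helpers assume every request succeeds and returns valid JSON. In practice the API may throttle or fail intermittently, so a retry wrapper like the one below can keep a single bad page from killing the run. This is a sketch of my own, not part of the original code; the helper name and defaults are made up.

```python
import time
import requests

def fetch_json(url, headers=None, retries=3, delay=2):
    """GET a URL and decode its JSON body, retrying on network/decode errors."""
    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as exc:
            print('Attempt {} failed for {}: {}'.format(attempt + 1, url, exc))
            time.sleep(delay)
    return None
```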
Result Screenshot
(screenshot of the scraped output omitted)
Complete Code
```python
import requests
import time
import json

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Referer': 'https://careers.tencent.com/search.html?keywords=python&lid=0&tid=0&start=1',
    'Cookie': 'pgv_pvi=9905274880; _ga=GA1.2.134754307.1606182211; pgv_pvid=3632371128; pgv_info=ssid=s598319774; _gcl_au=1.1.1062400509.1622338581; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22100019473226%22%2C%22first_id%22%3A%226ab28e9051a5f99e96cec737ad4367a7%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%2217a5f65aa69497-0a4a94eb345f15-34657601-1296000-17a5f65aa6ad9e%22%7D; loading=agree'
}


def get_detail_msg(detail_id):
    position = {}
    detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1625794375072&postId={}&language=zh-cn'.format(detail_id)
    # print('Requesting detail URL: ' + detail_url)
    response = requests.get(detail_url, headers=HEADERS)
    json_obj = json.loads(response.text)
    # [Data] job title
    position['title'] = json_obj['Data']['RecruitPostName']
    # [Data] work location / job category
    position['location'] = json_obj['Data']['LocationName']
    position['category'] = json_obj['Data']['CategoryName']
    # [Data] responsibilities
    position['duty'] = json_obj['Data']['Responsibility']
    # [Data] requirements
    position['ask'] = json_obj['Data']['Requirement']
    return position


def get_jo_detail_urls(page_url):
    # Collect the PostId of every job on this page; a set removes duplicates
    post_ids = set()
    response = requests.get(page_url, headers=HEADERS)
    json_obj = json.loads(response.text)
    for item in json_obj['Data']['Posts']:
        post_ids.add(item['PostId'])
    return post_ids


def spider():
    # 0. The job data to return
    positions = []
    for page_num in range(1, 2):
        print('Scraping page {}...'.format(page_num))
        # 1. The list-API URL for this page
        url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1625731961957&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'.format(page_num)
        # 2. Collect the post IDs of every job on the current page
        detail_urls = get_jo_detail_urls(url)
        # 3. Fetch and parse each job's detail data, one by one
        for detail_url in detail_urls:
            position = get_detail_msg(detail_url)
            positions.append(position)
            time.sleep(1)
    print(positions)
    print('Scraping complete!')
    # Return the data so callers can persist it
    return positions


if __name__ == '__main__':
    spider()
```
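The script only prints the result. To keep the data, a small helper like the one below can write the list returned by spider() to CSV; the helper name, field order, and CSV format are my own choices, not part of the original post.

```python
import csv

def save_positions(positions, path='positions.csv'):
    # Write the list of dicts produced by spider() to a CSV file.
    fields = ['title', 'location', 'category', 'duty', 'ask']
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(positions)

save_positions(spider())
```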