更快获取腾讯招聘职位信息|Python 主题月

本文正在参加「Python主题月」,详情查看 活动链接

工作准备

  上一章节中,介绍了如何更好地获取影片资源,同时也介绍了如何通过静态获取的方式去得到我们想要的信息。这里再通过动态接口的方式获取工作岗位信息!!!

本地运行环境也是基于docker,搭建的详细步骤,小伙伴们可以查看上一篇文章的介绍 ==>传送门

代码编写

需求分析

   打开腾讯招聘信息页面并查看页面元素时,发现页面元素中没有我们想要的数据信息;查看接口后发现,这些信息都是通过接口获取的。于是,这次我们通过接口来获取想要的信息。

  1. 查看首页根据关键词查找分页的url翻页规律。
  2. 查看分页数据接口返回值中的信息,进行摘取。
  3. 查看详情页接口信息,对返回值进行摘取。

编写代码

  1. 首先定义函数入口,将对应分页信息中的关键信息进行提取。
            for page_num in range(1, 2):
                # page_num is sent to the API unchanged as pageIndex, so report
                # that same number (the original printed page_num + 1, which
                # labelled page 1 as page 2).
                print('开始爬取第{}页数据'.format(page_num))
                # 1. Address of the listing API for this page
                url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1625731961957&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'.format(page_num)

                # 2. Collect the posting identifiers listed on the current page
                detail_urls = get_jo_detail_urls(url)

                # 3. Fetch and parse the detail data of each posting in turn
                for detail_url in detail_urls:
                    position = get_detail_msg(detail_url)
                    positions.append(position)

                # Pause one second before moving on to the next page
                time.sleep(1)
复制代码
  2. 解析接口返回值,提取其中的信息并拼装成详情页数据返回。
def get_detail_msg(detail_id):
    """Fetch one job posting from the detail API and return its key fields.

    Args:
        detail_id: Value substituted into the ``postId`` query parameter of
            the ByPostId endpoint.

    Returns:
        dict: Keys ``title``, ``location``, ``category``, ``duty`` and
        ``ask``, copied from the ``Data`` object of the JSON response.

    Raises:
        KeyError: If the decoded response has no ``Data`` entry or ``Data``
            lacks one of the fields read below.
    """
    detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1625794375072&postId={}&language=zh-cn'.format(detail_id)

    # A single GET is sufficient; the earlier version repeated the identical
    # request and threw the second response away.
    response = requests.get(detail_url, headers=HEADERS)
    data = json.loads(response.text)['Data']

    return {
        # Job title
        'title': data['RecruitPostName'],
        # Work location / job category
        'location': data['LocationName'],
        'category': data['CategoryName'],
        # Responsibilities
        'duty': data['Responsibility'],
        # Requirements
        'ask': data['Requirement'],
    }


def get_jo_detail_urls(page_url):
    """Return the set of ``PostId`` values listed by one page of the Query API.

    Despite the name, the result holds posting identifiers rather than URLs.

    Args:
        page_url: Full URL of the listing request.

    Returns:
        set: Every ``PostId`` found under ``Data`` -> ``Posts`` in the JSON
        response. The set is also printed before being returned.
    """
    listing = requests.get(page_url, headers=HEADERS)
    payload = json.loads(listing.text)
    post_ids = {post['PostId'] for post in payload['Data']['Posts']}
    print(post_ids)
    return post_ids
复制代码

效果截图

image.png

完整代码

import requests
import time
import json


# HTTP request headers passed as `headers=` to every requests.get call in this script.
HEADERS = {
	# Browser identification string (desktop Chrome on macOS).
	'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
	# Referring page: the careers site search page.
	'Referer': 'https://careers.tencent.com/search.html?keywords=python&lid=0&tid=0&start=1',
	# NOTE(review): hard-coded cookie string, presumably copied from a browser
	# session — confirm whether the API needs it and avoid publishing personal values.
	'Cookie': 'pgv_pvi=9905274880; _ga=GA1.2.134754307.1606182211; pgv_pvid=3632371128; pgv_info=ssid=s598319774; _gcl_au=1.1.1062400509.1622338581; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22100019473226%22%2C%22first_id%22%3A%226ab28e9051a5f99e96cec737ad4367a7%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%2217a5f65aa69497-0a4a94eb345f15-34657601-1296000-17a5f65aa6ad9e%22%7D; loading=agree'
}


def get_detail_msg(detail_id):
    """Fetch one job posting from the detail API and return its key fields.

    Args:
        detail_id: Value substituted into the ``postId`` query parameter of
            the ByPostId endpoint.

    Returns:
        dict: Keys ``title``, ``location``, ``category``, ``duty`` and
        ``ask``, copied from the ``Data`` object of the JSON response.

    Raises:
        KeyError: If the decoded response has no ``Data`` entry or ``Data``
            lacks one of the fields read below.
    """
    detail_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1625794375072&postId={}&language=zh-cn'.format(detail_id)

    # A single GET is sufficient; the earlier version repeated the identical
    # request and threw the second response away.
    response = requests.get(detail_url, headers=HEADERS)
    data = json.loads(response.text)['Data']

    return {
        # Job title
        'title': data['RecruitPostName'],
        # Work location / job category
        'location': data['LocationName'],
        'category': data['CategoryName'],
        # Responsibilities
        'duty': data['Responsibility'],
        # Requirements
        'ask': data['Requirement'],
    }



def get_jo_detail_urls(page_url):
    """Return the set of ``PostId`` values listed by one page of the Query API.

    Despite the name, the result holds posting identifiers rather than URLs.

    Args:
        page_url: Full URL of the listing request.

    Returns:
        set: Every ``PostId`` found under ``Data`` -> ``Posts`` in the JSON
        response.
    """
    listing = requests.get(page_url, headers=HEADERS)
    payload = json.loads(listing.text)
    return {post['PostId'] for post in payload['Data']['Posts']}


def spider():
    """Crawl the job listing pages and gather the details of every posting.

    For each page index in the hard-coded range, the listing API is queried
    for posting identifiers, each identifier is resolved through
    ``get_detail_msg``, and the results are accumulated. The collected list
    is printed, followed by a completion message.

    Returns:
        list: One dict per posting, as produced by ``get_detail_msg``.
    """
    # Accumulates the detail dict of every posting seen
    positions = []

    for page_num in range(1, 2):
        # page_num is sent to the API unchanged as pageIndex, so report that
        # same number (previously page_num + 1, which labelled page 1 as 2).
        print('开始爬取第{}页数据'.format(page_num))

        # 1. Address of the listing API for this page
        url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1625731961957&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'.format(page_num)

        # 2. Collect the posting identifiers listed on the current page
        detail_ids = get_jo_detail_urls(url)

        # 3. Fetch and parse the detail data of each posting in turn
        for detail_id in detail_ids:
            positions.append(get_detail_msg(detail_id))

        # Pause one second before moving on to the next page
        time.sleep(1)

    print(positions)
    print('爬取完成!')
    # Hand the data back as well, so callers can use it beyond the printout
    return positions

# Start the crawl only when this file is executed as a script, not when imported.
if __name__ == '__main__':
	spider()
复制代码
© 版权声明
THE END
喜欢就支持一下吧
点赞0 分享