spider.py
#encoding: utf-8
# pip install bs4
# pip install requests
# sudo pip install xxx
# list all currently installed packages:
# pip list
# sudo easy_install pip
import requests
from bs4 import BeautifulSoup
import json
import time


def crawl_detail(position_id):
    # Fetch the detail page for one job posting and return the text
    # of its description block (<dd class="job_bt">).
    url = 'https://www.lagou.com/jobs/%s.html' % position_id
    headers = {
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'lxml')
    job_bt = soup.find('dd', attrs={'class': 'job_bt'})
    return job_bt.text


def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
        'Host': 'www.lagou.com',
        'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
        'X-Anit-Forge-Code': '0',
        'X-Anit-Forge-Token': None,
        'X-Requested-With': 'XMLHttpRequest'
    }
    # Single-page version, kept for reference:
    # form_data = {'first': 'true', 'pn': '1', 'kd': 'python'}
    # result = requests.post('https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false', headers=headers, data=form_data)
    # json_result = result.json()
    # positions = json_result['content']['positionResult']['result']
    positions = []
    for x in range(1, 5):
        form_data = {
            'first': 'true',
            'pn': x,
            'kd': 'python'
        }
        result = requests.post('https://www.lagou.com/jobs/positionAjax.json?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false', headers=headers, data=form_data)
        json_result = result.json()
        print json_result
        print '=' * 50
        page_positions = json_result['content']['positionResult']['result']
        for position in page_positions:
            # print position
            # print '-' * 30
            # Keep only the fields we need and drop the rest.
            position_dict = {
                'position_name': position['positionName'],
                'work_year': position['workYear'],
                'salary': position['salary'],
                'district': position['district'],
                'company_name': position['companyFullName'],
            }
            position_id = position['positionId']
            # With the position id in hand, crawl that job's detail page.
            position_detail = crawl_detail(position_id)
            position_dict['position_detail'] = position_detail
            positions.append(position_dict)
        # If Lagou answers "您操作太频繁,请稍后再试" ("you are operating too
        # frequently, please try again later"), either:
        # 1. increase the sleep interval, or
        # 2. request fewer pages per run and split the work across runs.
        time.sleep(5)

    line = json.dumps(positions, ensure_ascii=False)
    with open('lagou.json', 'w') as fp:
        fp.write(line.encode('utf-8'))


if __name__ == '__main__':
    main()
    # crawl_detail('3265286')

# Alternative approach: selenium + PhantomJS/ChromeDriver (see spider_selenium.py below)
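The throttling workaround described in the comments inside main() can also be handled in code. Below is a minimal retry-with-backoff sketch, not part of the original script: the helper name fetch_page, the retry and backoff values, and the check on a top-level 'success' field are assumptions based on the shape of the positionAjax.json payload used above; verify them against the live response.

#encoding: utf-8
import time
import requests


def fetch_page(url, headers, form_data, max_retries=3, backoff=10):
    # POST the form; if Lagou throttles us ("您操作太频繁..."),
    # sleep a little longer after each failed attempt, then retry.
    # The 'success' key is assumed from the payload shape above.
    for attempt in range(max_retries):
        result = requests.post(url, headers=headers, data=form_data)
        json_result = result.json()
        if json_result.get('success'):
            return json_result
        time.sleep(backoff * (attempt + 1))
    raise RuntimeError('still throttled after %d retries' % max_retries)

main() could then call fetch_page() in place of the bare requests.post and keep the per-page sleep as an extra safety margin.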
spider_selenium.py
#encoding: utf-8
from selenium import webdriver
# BeautifulSoup can parse driver.page_source once the page has rendered.
from bs4 import BeautifulSoup

# Drive a real browser engine so the page's JavaScript runs before we
# read it. PhantomJS is used here; ChromeDriver is a drop-in alternative.
driver = webdriver.PhantomJS(r'C:\Users\hynev\develop\phantomjs\bin\phantomjs.exe')
driver.get('https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=')
# click() returns None, so click first, then print the element's text.
job_list = driver.find_element_by_class_name('item_con_list')
job_list.click()
print job_list.text
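Since the comments name chromedriver as the alternative, here is a minimal headless-Chrome sketch of the same idea. The executable_path is a placeholder, and handing driver.page_source to BeautifulSoup is my assumption about how the otherwise unused import above was meant to be used; the item_con_list class name is taken from the original script.

#encoding: utf-8
from selenium import webdriver
from bs4 import BeautifulSoup

options = webdriver.ChromeOptions()
options.add_argument('--headless')
# executable_path is a placeholder -- point it at your own chromedriver.
driver = webdriver.Chrome(executable_path=r'C:\path\to\chromedriver.exe',
                          chrome_options=options)
driver.get('https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=')
# By the time get() returns, the JavaScript has populated the job list,
# so the rendered HTML can be parsed like any static page.
soup = BeautifulSoup(driver.page_source, 'lxml')
print soup.find(attrs={'class': 'item_con_list'}).text
driver.quit()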
Reposted from: https://www.tongpankt.com/7486