
Job Hunting, Job Hunting

With the end of the year approaching, many of you are probably thinking about your next move. When I was job hunting I found myself combing through listings just to find companies close to home, but many postings do not show an address in the summary, so I had to click into each one to see the detailed location.
So I wrote a spider to make job hunting easier.

Project overview

The main code consists of job_spider.py (the spider) and jobs_data_analyse.py (the job data analysis).
The spider first fetches the job listings, collects a summary of each job, and then fetches the details. Once the data is downloaded, the analysis runs.
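
In outline, a full run boils down to three calls; this is a condensed view of the start() function that appears at the end of jobs_data_analyse.py below:

check_area_name()                       # make sure the local 51job area-code file exists
fetch_data(web_type=WEBTYPE.all,        # crawl both 51job and Zhilian
           keywords=['iOS'], area='深圳', page_count=5)
jobs_data_analyse()                     # salary statistics + requirements word cloud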

The full code is below.

job_spider.py

from bs4 import BeautifulSoup
import requests
import os
from enum import Enum
from program import config
import pandas as pd

pd.set_option('expand_frame_repr', False)  # do not wrap rows when there are many columns


class WEBTYPE(Enum):
    _51job = '_51job'    # 51job
    zhilian = 'zhilian'  # Zhilian
    all = 3              # all sites


# global variable: counts how many detail pages have been crawled
SPIDER_REQUIRE_COUNT = 0


# fetch the mapping from 51job area codes to area names
def get_51job_area_code():
    dic = {}
    for i in range(1, 37):
        url = 'http://search.51job.com/list/{}0000,000000,0000,00,9,99,ios,2,1.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare='.format('%02d' % i)
        r = requests.get(url, headers=config.http_headers).content.decode('gbk')
        area_name = BeautifulSoup(r, 'lxml').find(id="work_position_input")['value']
        print(area_name, i)
        dic[area_name] = i
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    print('51job地区编号文件获取成功')
    with open(file_path, "w+", encoding="utf-8") as f:
        f.write(str(dic))


# check whether the local 51job area-code file exists; fetch it automatically if not
def check_area_name():
    file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
    if os.path.exists(file_path):
        with open(file_path, "r", encoding="utf-8") as f:
            result = f.read()
            dic = eval(result)
    else:
        print('51job缺少地区编号文件,获取中')
        get_51job_area_code()
        check_area_name()


def fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], page_count=5, area='深圳'):
    if os.path.exists(config.jobs_data_path):
        os.remove(config.jobs_data_path)
        print('删除之前爬的数据')
    if web_type == WEBTYPE._51job:
        _fetch_data(web_type, keywords, page_count, area)
    elif web_type == WEBTYPE.zhilian:
        _fetch_data(web_type, keywords, page_count, area)
    elif web_type == WEBTYPE.all:
        for site in list(WEBTYPE)[0:-1]:  # every member except WEBTYPE.all
            _fetch_data(site, keywords, page_count, area)


def _fetch_data(web_type, keywords, page_count, area):
    df = fetch_job_introduce(web_type, keywords, page_count, area)
    df = fetch_job_detail(df)
    df.fillna(value='', inplace=True)
    if os.path.exists(config.jobs_data_path):
        df_existed = pd.read_csv(config.jobs_data_path, encoding='utf-8', index_col=0)
        df = df.append(df_existed, ignore_index=True)
    df.sort_values(by=['地区'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df.to_csv(config.jobs_data_path, mode='w', encoding='utf-8')
    # drop the requirements column so the summary file is easier to read
    df_no_require = df.drop(['要求'], axis=1)
    df_no_require['薪酬'] = df_no_require['薪酬'].apply(_make_introduce_beautiful, min_length=12)
    df_no_require['地区'] = df_no_require['地区'].apply(_make_introduce_beautiful, min_length=12)
    df_no_require['详细地址'] = df_no_require['详细地址'].apply(_make_introduce_beautiful, min_length=30)
    df_no_require['链接'] = df_no_require['链接'].apply(_make_introduce_beautiful, min_length=60)
    df_no_require.to_csv(config.jobs_data_introduce_path, mode='w', encoding='utf-8')


# make the summary prettier: left-justify and pad with spaces
def _make_introduce_beautiful(txt, min_length):
    try:
        return txt.ljust(min_length)
    except Exception as e:
        print(e)
        return ''.ljust(min_length)


# fetch the job summaries from the search result pages
def fetch_job_introduce(web_type, keywords, page_count, area):
    url = ""
    decode_type = ""
    # each site uses a different URL format
    area_need = ""
    if web_type == WEBTYPE._51job:
        url = "http://search.51job.com/list/{}0000,000000" \
              ",0000,00,9,99,{},2,{}.html?lang=c&stype=1&postchannel=0000&workyear=99&" \
              "cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0" \
              "&confirmdate=9&fromType=1&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
        decode_type = 'gbk'
        file_path = os.path.join(config.job_data_dir, '51job_area_code.txt')
        with open(file_path, mode='r', encoding='utf-8') as f:
            result = f.read()
            dic = eval(result)
            area_need = '%02d' % dic[area]
    elif web_type == WEBTYPE.zhilian:
        url = "http://sou.zhaopin.com/jobs/searchresult.ashx?jl={}&kw={}&isadv=0&sg=7e9e61449fd14593a5604fff81aec46a&p={}"
        decode_type = "utf-8"
        area_need = area  # Zhilian takes the city name directly
    # the real page numbers start from 1, hence the +1
    urls = [url.format(area_need, ''.join(keywords), p + 1) for p in range(0, page_count)]
    df = fetch_companies(urls, decode_type, web_type)
    return df


def fetch_companies(urls, decode_type, web_type):
    df = pd.DataFrame(columns=['薪酬', '地区', '详细地址', '链接', '工作', '公司', '来源', '要求'])
    # the requested pages are indexed from 0
    for url in urls:
        r = requests.get(url, headers=config.http_headers).content.decode(decode_type)
        if web_type == WEBTYPE._51job:
            bs = BeautifulSoup(r, 'lxml').find("div", class_="dw_table").find_all("div", class_="el")
            for b in bs:
                try:
                    href, job_name = b.find('a')['href'], b.find('a')['title']
                    company_name = b.find('span', class_='t2').text
                    locate = b.find('span', class_='t3').text
                    salary = b.find('span', class_='t4').text
                    dic = {'工作': job_name,
                           '地区': locate,
                           '详细地址': '',
                           '薪酬': salary,
                           '公司': company_name,
                           '链接': href,
                           '来源': web_type.value,
                           '要求': ''}
                    index = df.shape[0]
                    df.loc[index] = dic
                    # print(df)
                except Exception as e:
                    print(e, "简介解析错误")
        elif web_type == WEBTYPE.zhilian:
            bs = BeautifulSoup(r, 'lxml').find(id="newlist_list_content_table").find_all("table", class_="newlist")
            for b in bs:
                try:
                    # the first table carries no job information
                    href = b.find("td", class_="zwmc").find("div").find("a")["href"]
                    job_name = b.find("td", class_="zwmc").find("div").find("a").text
                    company_name = b.find("td", class_="gsmc").find("a").text
                    locate = b.find("td", class_="gzdd").text
                    salary = b.find("td", class_="zwyx").text
                    dic = {'工作': job_name,
                           '地区': locate,
                           '详细地址': '',
                           '薪酬': salary,
                           '公司': company_name,
                           '链接': href,
                           '来源': web_type.value,
                           '要求': ''}
                    index = df.shape[0]
                    df.loc[index] = dic
                    # print(df)
                except Exception as e:
                    print(e, "简介解析错误")
    return df


# fetch the job details for every summary row
def fetch_job_detail(df):
    for i in range(0, df.shape[0]):
        introduce = df.loc[i]
        location, require = _fetch_location_and_require_from_detail(introduce)
        df.loc[i, '详细地址'] = location
        df.loc[i, '要求'] = require
    return df


# fetch the detailed address and the job requirements from the detail page
def _fetch_location_and_require_from_detail(introduce):
    global SPIDER_REQUIRE_COUNT
    web_type = introduce['来源']
    href = introduce['链接']
    company_name = introduce['公司']
    if web_type == WEBTYPE._51job.value:
        SPIDER_REQUIRE_COUNT += 1
        print("正在爬第{}个公司{}的要求\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
        try:
            r = requests.get(href, headers=config.http_headers).content.decode("gbk")
            location_detail = _fetch_location_from_detail(r, introduce)
            bs = BeautifulSoup(r, 'lxml').find('div', class_="bmsg job_msg inbox")
            useless_bs1 = bs.find('p', class_='fp')
            useless_bs2 = bs.find('div', class_='share')
            require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '') \
                .replace("\t", "").replace("\n", "").replace("\r", "")
            return location_detail, require
        except Exception as e:
            print(e, "工作要求解析错误")
            return "", ""
    elif web_type == WEBTYPE.zhilian.value:
        SPIDER_REQUIRE_COUNT += 1
        print("正在爬第{}个公司{}的要求\n{}".format(SPIDER_REQUIRE_COUNT, company_name, href))
        try:
            r = requests.get(href, headers=config.http_headers).content.decode("utf-8")
            location_detail = _fetch_location_from_detail(r, introduce)
            bs = BeautifulSoup(r, 'lxml').find('div', class_="tab-inner-cont")
            useless_bs1 = bs.find('b')
            useless_bs2 = bs.find('h2')
            useless_bs3 = bs.find(id='applyVacButton1')
            require = bs.text.replace(useless_bs1.text, '').replace(useless_bs2.text, '').replace(useless_bs3.text, '') \
                .replace("\t", "").replace("\n", "").replace("\r", "")
            return location_detail, require
        except Exception as e:
            print(e, "工作要求解析错误")
            return "", ""


# fetch the detailed working address
def _fetch_location_from_detail(h5_content, introduce):
    """Fetch the company's detailed address."""
    web_type = introduce['来源']
    if web_type == WEBTYPE._51job.value:
        bs = BeautifulSoup(h5_content, 'lxml').find_all('p', class_="fp")
        for b in bs:
            try:
                location = b.text
                if "上班地址" in location:
                    location = location.replace("上班地址:", "").replace("\t", "").replace("\n", "")
                    return location
            except Exception as e:
                print(e, '上班地址解析错误')
                return introduce['地区']
    elif web_type == WEBTYPE.zhilian.value:
        bs = BeautifulSoup(h5_content, 'lxml').find('div', class_="tab-inner-cont")
        try:
            location = bs.find("h2").text
            location = location.replace("\t", "").replace("\n", "").replace("\r", "").replace(" ", "").replace("查看职位地图", "")
            return location
        except Exception as e:
            print(e, '上班地址解析错误')
            return introduce['地区']
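
If you only want one of the two sites, fetch_data can also be called on its own. A minimal sketch, assuming the project's config module is set up as in the repo and that the chosen city (广州 is just an example) appears in the generated 51job_area_code.txt:

check_area_name()   # generates 51job_area_code.txt on first run
# crawl only 51job: the first 3 result pages of iOS jobs in Guangzhou (example parameters)
fetch_data(web_type=WEBTYPE._51job, keywords=['iOS'], page_count=3, area='广州')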

jobs_data_analyse.py

import os
from program import config
import pandas as pd
import math
import jieba
import jieba.posseg
import csv
import matplotlib.pyplot as plt
from program.job_spider import *
import numpy as np
from PIL import Image
from collections import Counter
from wordcloud import WordCloud

pd.set_option('expand_frame_repr', False)


def jobs_data_analyse():
    df = pd.read_csv(config.jobs_data_path, encoding='utf-8')
    df['薪酬'] = df['薪酬'].apply(unify_salary_form)
    salary_analyse(df)
    require_analyse(df)


# normalise the salary format to a per-month range in yuan
def unify_salary_form(salary):
    if type(salary) == float and math.isnan(salary):
        return None
    month = 1
    if salary.endswith('/年'):
        month = 12
        salary = salary.replace('/年', '')
    elif salary.endswith('/月'):
        month = 1
        salary = salary.replace('/月', '')
    multiple = 1
    if salary.endswith('千'):
        multiple = 1000
        salary = salary.replace('千', '')
    elif salary.endswith('万'):
        multiple = 10000
        salary = salary.replace('万', '')
    # print(salary)
    try:
        min = int(float(salary.split('-')[0]) * multiple / month)
        max = int(float(salary.split('-')[1]) * multiple / month)
        return str(min), str(max), str(min) + '-' + str(max)
    except Exception as e:
        print(e)
        return None


# analyse salaries
def salary_analyse(df):
    df['low_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[0]))
    df['high_薪酬'] = df['薪酬'].apply(lambda x: None if x is None else int(x[1]))
    print('该行业平均工资为: ', df.dropna(subset=['薪酬'])[['low_薪酬', 'high_薪酬']].mean().mean())
    index_max_salary = df['high_薪酬'].idxmax()
    index_min_salary = df['low_薪酬'].idxmin()
    print('最高薪酬的公司: %s 薪酬为: %d 链接如下\n%s'
          % (df.loc[index_max_salary, '公司'], df['high_薪酬'].max(), df.loc[index_max_salary, '链接']))
    print('最低薪酬的公司: %s 薪酬为: %d 链接如下\n%s'
          % (df.loc[index_min_salary, '公司'], df['low_薪酬'].min(), df.loc[index_min_salary, '链接']))
    for area, group in df.dropna(subset=['薪酬']).groupby('地区'):
        average_salary = group[['low_薪酬', 'high_薪酬']].mean().mean()
        print('该行业在地区:(%s)的平均薪酬为:%d' % (area, average_salary))


# analyse job requirements
def require_analyse(df):
    all_require = ''
    for require in df['要求']:
        if type(require) == float and math.isnan(require):
            continue
        all_require += require
    _require_word_freq(all_require)
    _require_word_cloud()


def _require_word_freq(all_require):
    # load the user dictionary
    jieba.load_userdict(os.path.join(config.jieba_dir, "user_dict.txt"))
    seg_lst = jieba.posseg.cut(all_require)
    counter = Counter()
    # load the stop words
    stopwords_path = os.path.join(config.jieba_dir, "stopwords.txt")
    stopwords = [line.strip() for line in open(stopwords_path, "r", encoding="utf-8").readlines()]
    for seg in seg_lst:
        if seg.word in stopwords:
            continue
        # filter out punctuation and other symbols
        elif seg.flag == 'x':
            continue
        counter[seg.word] += 1
    counter_sorted = sorted(counter.items(), key=lambda value: value[1], reverse=True)
    with open(config.jobs_require_word_freq_path, "w+", encoding="utf-8") as f:
        f_csv = csv.writer(f)
        f_csv.writerows(counter_sorted)
        print('词频文件保存成功,地址为:', config.jobs_require_word_freq_path)


def _require_word_cloud():
    word_freq_dic = dict()
    with open(config.jobs_require_word_freq_path, mode='r', encoding='utf-8') as f:
        f_csv = csv.reader(f)
        # print(f_csv)
        for row in f_csv:
            word_freq_dic[row[0]] = int(row[1])
        # print(word_freq_dic)
    # generate the word cloud with an image as mask;
    # the alice image comes from http://blog.csdn.net/fontthrone/article/details/72775865
    # alice_coloring = np.array(Image.open(config.alice_png))
    # wc = WordCloud(font_path=config.wc_font_path, background_color='white', mask=alice_coloring,
    #                max_words=150, max_font_size=100, min_font_size=20)\
    #     .generate_from_frequencies(word_freq_dic)
    wc = WordCloud(font_path=config.wc_font_path,
                   max_words=150, height=800, width=1400).generate_from_frequencies(word_freq_dic)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis('off')
    plt.show()
    wc.to_file(config.wordcloud_png_path)


def start():
    check_area_name()
    fetch_data(web_type=WEBTYPE.all, keywords=['iOS'], area='深圳', page_count=5)
    jobs_data_analyse()


start()

Usage

Open the project file jobs_data_analyse.py and run it; adjust the parameters to suit your own needs.
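
The parameters you are most likely to change are the ones in the fetch_data call inside start(); the values below are just examples:

fetch_data(web_type=WEBTYPE.all,   # WEBTYPE._51job, WEBTYPE.zhilian, or WEBTYPE.all
           keywords=['iOS'],       # search keywords
           area='深圳',            # city to search in
           page_count=5)           # number of result pages to crawl per site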

Once it is running, it starts collecting data.

Crawling the data

After collection finishes, it runs a brief analysis of the salary data it gathered.

Salary analysis
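
For reference, this is roughly what the salary normalisation in unify_salary_form above produces; the strings below are made-up examples in the format the code expects, converted to a per-month range in yuan:

print(unify_salary_form('1-1.5万/月'))   # ('10000', '15000', '10000-15000')
print(unify_salary_form('15-20万/年'))   # ('12500', '16666', '12500-16666')
print(unify_salary_form('8-10千/月'))    # ('8000', '10000', '8000-10000')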

Finally, it generates a word cloud from the job requirements.

Word frequencies for iOS jobs in Shenzhen
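
If you only want the word-cloud step, here is a minimal standalone sketch; the frequency dict and the font path are placeholders (a font containing CJK glyphs is required so that Chinese words render correctly):

from wordcloud import WordCloud
import matplotlib.pyplot as plt

freq = {'iOS': 120, 'Swift': 80, 'Objective-C': 75, '经验': 60, '开发': 55}  # hypothetical counts
wc = WordCloud(font_path='/path/to/a-cjk-font.ttf',   # placeholder font path
               width=1400, height=800, max_words=150).generate_from_frequencies(freq)
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()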

To make it easy to browse jobs by area, the job summaries are written to jobs_data_introduce.csv; just search for the area you are interested in.


Job summaries
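
You can also filter the summary file with pandas instead of searching by hand. A small sketch, assuming the column layout written by _fetch_data above and using 南山 (a Shenzhen district) as an example filter:

import pandas as pd

df = pd.read_csv('jobs_data_introduce.csv', encoding='utf-8', index_col=0)
nanshan = df[df['地区'].str.contains('南山', na=False)]
print(nanshan[['公司', '薪酬', '详细地址', '链接']])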

This demo was written to fit my own needs, so treat it as a reference only.

Demo address
