  • 数据读取

  • 数据整理

  • 对职位数量在全国主要城市的分布情况进行分析

  • 对全国范围内的职位月薪情况进行分析

  • 对该职位招聘岗位要求描述进行词云图分析,获取频率最高的关键字

  • 选取两个城市,分别分析月薪分布情况以及招聘要求的词云图分析


  1. import pymongo

  2. import pandas as pd

  3. import matplotlib.pyplot as plt

  4. import numpy as np

  5. % matplotlib inline

  # Use the ggplot look for every figure, then patch the font settings so
  # Chinese labels render correctly.
  plt.style.use('ggplot')
  plt.rcParams.update({
      'font.sans-serif': ['SimHei'],  # default font that contains CJK glyphs
      'axes.unicode_minus': False,    # keep the minus sign '-' from showing as a box in saved figures
  })

1 读取数据

  # Connect to the MongoDB server on localhost and select the collection
  # that holds the scraped job postings.
  client = pymongo.MongoClient('localhost')
  db = client['zhilian']
  table = db['python']

  # Document fields to load, in display order. The per-field meanings below
  # are the author's own labels, translated to English.
  columns = [
      'zwmc',       # job title
      'gsmc',       # company name
      'zwyx',       # monthly salary
      'gbsj',       # publication time
      'gzdd',       # work location
      'fkl',        # feedback rate
      'brief',      # job description
      'zw_link',    # posting URL
      '_id',        # MongoDB document id
      'save_date',  # date the record was saved
  ]

  # Materialise the whole cursor; one row per document.
  df = pd.DataFrame(list(table.find()), columns=columns)
  print('总行数为:{}行'.format(df.shape[0]))
  df.head(2)


2 数据整理

2.1 将str格式的日期变为 datetime

  # Parse the string-typed save dates into pandas datetimes, then print the
  # resulting dtype as a sanity check.
  parsed_save_date = pd.to_datetime(df['save_date'])
  df['save_date'] = parsed_save_date
  print(df['save_date'].dtype)

  1. datetime64[ns]

2.2 筛选月薪格式为“XXXX-XXXX”的信息

  # Keep the analysis fields only (everything loaded except MongoDB's '_id').
  df_clean = df[['zwmc', 'gsmc', 'zwyx', 'gbsj', 'gzdd',
                 'fkl', 'brief', 'zw_link', 'save_date']]

  # Keep only postings whose salary text contains a "<digits>-<digits>"
  # range, so the bounds can be split apart later.
  #  - raw string: '\d' inside a plain literal is an invalid escape sequence.
  #  - na=False: a missing salary counts as "no match" instead of putting
  #    NaN into the boolean mask, which cannot be used for row selection.
  has_salary_range = df_clean['zwyx'].str.contains(r'\d+-\d+', regex=True, na=False)
  df_clean = df_clean[has_salary_range]
  print('总行数为:{}行'.format(df_clean.shape[0]))

  1. 总行数为:22605

2.3 分割月薪字段,分别获取月薪的下限值和上限值

  # Split the "min-max" salary text into two numeric columns.
  # Background: http://stackoverflow.com/questions/14745022/pandas-dataframe-how-do-i-split-a-column-into-two
  #
  # expand=True makes str.split return a DataFrame with one column per piece
  # (0 = lower bound, 1 = upper bound). The older idiom
  # `a, b = series.str.split('-', 1).str` relied on iterating the `.str`
  # accessor and on passing `n` positionally; current pandas accepts neither.
  salary_bounds = df_clean['zwyx'].str.split('-', n=1, expand=True)

  # assign() returns a new frame, so df_clean itself is left untouched and
  # the new columns are appended after the existing ones.
  df_clean_concat = df_clean.assign(
      zwyx_min=pd.to_numeric(salary_bounds[0]),
      zwyx_max=pd.to_numeric(salary_bounds[1]),
  )
  print(df_clean_concat.dtypes)
  df_clean_concat.head(2)


  • 将数据信息按职位月薪进行排序

  # Order the postings by their lower salary bound (ascending), mutating
  # the frame in place.
  df_clean_concat.sort_values(by='zwyx_min', inplace=True)

  • 判断爬取的数据是否有重复值

  # Print every row whose posting URL already appeared in an earlier row;
  # an empty frame means the scraped data has no duplicates.
  duplicate_link_mask = df_clean_concat.duplicated('zw_link')
  print(df_clean_concat[duplicate_link_mask])

  1. Empty DataFrame

  2. Columns: [zwmc, gsmc, zwyx, gbsj, gzdd, fkl, brief, zw_link, save_date, zwyx_min, zwyx_max]

  3. Index: []

  • 从上述结果可看出,数据是没有重复的。

3 对全国范围内的职位进行分析

3.1 主要城市的招聘职位数量分布情况

  # Major cities included in the per-city breakdown.
  ADDRESS = [
      '北京', '上海', '广州', '深圳',
      '天津', '武汉', '西安', '成都', '大连',
      '长春', '沈阳', '南京', '济南', '青岛',
      '杭州', '苏州', '无锡', '宁波', '重庆',
      '郑州', '长沙', '福州', '厦门', '哈尔滨',
      '石家庄', '合肥', '惠州', '太原', '昆明',
      '烟台', '佛山', '南昌', '贵阳', '南宁',
  ]

  df_city = df_clean_concat.copy()

  # Work locations often carry a district suffix (e.g. "北京-朝阳区").
  # Replace everything from the city name to the end of the string with the
  # bare city name so all districts fall into the same group.
  for city in ADDRESS:
      df_city['gzdd'] = df_city['gzdd'].replace([city + '.*'], [city], regex=True)

  # Restrict the analysis to the major cities.
  df_city_main = df_city[df_city['gzdd'].isin(ADDRESS)]

  # Count postings per city. The columns are selected with a *list*:
  # tuple-style selection (['zwmc', 'gsmc'] without the inner brackets) on a
  # GroupBy is rejected by current pandas.
  df_city_main_count = df_city_main.groupby('gzdd')[['zwmc', 'gsmc']].count()

  # Reuse the second count column to hold each city's share of all postings,
  # then give both columns descriptive names.
  df_city_main_count['gsmc'] = (
      df_city_main_count['gsmc'] / df_city_main_count['gsmc'].sum()
  )
  df_city_main_count.columns = ['number', 'percentage']

  # Most postings first.
  df_city_main_count.sort_values(by='number', ascending=False, inplace=True)

  # Helper column "<city> <rounded percent>%", used as the legend text when
  # plotting.
  df_city_main_count['label'] = (
      df_city_main_count.index
      + ' '
      + (df_city_main_count['percentage'] * 100).round().astype('int').astype('str')
      + '%'
  )
  print(type(df_city_main_count))

  # Top 10 cities by number of postings.
  print(df_city_main_count.head(10))

  1. <class 'pandas.core.frame.DataFrame'>

  2.      number  percentage   label

  3. gzdd                            

  4. 北京      6936    0.315948  北京 32%

  5. 上海      3213    0.146358  上海 15%

  6. 深圳      1908    0.086913   深圳 9%

  7. 成都      1290    0.058762   成都 6%

  8. 杭州      1174    0.053478   杭州 5%

  9. 广州      1167    0.053159   广州 5%

  10. 南京       826    0.037626   南京 4%

  11. 郑州       741    0.033754   郑州 3%

  12. 武汉       552    0.025145   武汉 3%

  13. 西安       473    0.021546   西安 2%

  • 对结果进行绘图:

  from matplotlib import cm

  label = df_city_main_count['label']
  sizes = df_city_main_count['number']

  # One colour per city, sampled at evenly spaced positions of the PiYG
  # colormap (other options: Paired, autumn, rainbow, gray, spring).
  n_slices = len(sizes)
  colors = cm.PiYG(np.arange(n_slices) / n_slices)

  # Two side-by-side axes: the pie on the left, its legend on the right.
  fig, axes = plt.subplots(ncols=2, figsize=(10, 6))
  ax1, ax2 = axes.ravel()

  # There are too many cities to label the wedges directly, so the pie is
  # drawn without labels or percentages.
  patches, texts = ax1.pie(sizes, colors=colors, labels=None,
                           shadow=False, startangle=0)
  ax1.axis('equal')
  ax1.set_title('职位数量分布', loc='center')

  # The second axes only hosts the legend.
  ax2.axis('off')
  ax2.legend(patches, label, loc='center left', fontsize=9)

  plt.savefig('job_distribute.jpg')
  plt.show()


3.2 月薪分布情况(全国)

  from matplotlib.ticker import FormatStrFormatter  # unused in this cell; retained from the original


  def _plot_min_salary(salaries, bins, title_suffix, filename):
      """Plot the minimum-salary curve and its histogram, then save the figure.

      Args:
          salaries: Series of minimum monthly salaries, plotted in the order
              given (the caller passes data already sorted by salary).
          bins: Histogram bin edges; the bins may have unequal widths.
          title_suffix: Text appended to both subplot titles.
          filename: Path handed to ``fig.savefig``.

      Returns:
          The matplotlib Figure that was drawn and saved.
      """
      fig, (ax1, ax2) = plt.subplots(figsize=(10, 8), nrows=2)

      # Top panel: salary against row position.
      ax1.plot(np.arange(len(salaries)), salaries)
      ax1.set_title('Trend of min monthly salary in China' + title_suffix, size=14)
      ax1.set_xticklabels('')
      ax1.set_ylabel('min monthly salary(RMB)')

      # Bottom panel: normalised histogram. `density=True` replaces the
      # `normed` keyword, which matplotlib no longer accepts.
      densities, edges, _ = ax2.hist(salaries, bins, density=True, histtype='bar',
                                     facecolor='g', rwidth=0.8)
      ax2.set_title('Hist of min monthly salary in China' + title_suffix, size=14)
      ax2.set_yticklabels('')
      # Put a tick on every bin edge and rotate the labels.
      # See http://stackoverflow.com/questions/6352740/matplotlib-label-each-bin
      ax2.set_xticks(edges)
      ax2.set_xticklabels(edges, rotation=-90)

      # Label each bin with its share of the postings. A density is
      # "fraction per unit width", so the share is density * width; using the
      # raw densities would understate bins that are wider than the others.
      # Salaries outside the bin range are not part of the histogram, so the
      # shares are relative to the in-range postings.
      widths = np.diff(edges)
      shares = densities * widths
      bin_centers = edges[:-1] + 0.5 * widths
      total = shares.sum()
      if total > 0:  # False for empty input (NaN total): nothing to label
          for share, x in zip(shares, bin_centers):
              percent = '%0.0f%%' % (100 * float(share) / total)
              ax2.annotate(percent, xy=(x, 0), xycoords=('data', 'axes fraction'),
                           xytext=(0, -40), textcoords='offset points',
                           va='top', ha='center', rotation=-90, color='b', size=14)

      fig.savefig(filename)
      return fig


  # All postings nationwide; the last bin collects everything from 24000 up.
  fig = _plot_min_salary(
      df_clean_concat['zwyx_min'],
      [3000, 6000, 9000, 12000, 15000, 18000, 21000, 24000, 100000],
      '',
      'salary_quanguo_min.jpg',
  )

  # Same view restricted to postings whose minimum salary is at most 20000,
  # which removes the long tail from the curve.
  df_zwyx_adjust = df_clean_concat[df_clean_concat['zwyx_min'] <= 20000]
  fig = _plot_min_salary(
      df_zwyx_adjust['zwyx_min'],
      [3000, 6000, 9000, 12000, 15000, 18000, 21000],
      ' (adjust)',
      'salary_quanguo_min_adjust.jpg',
  )


3.3 相关技能要求

  # Concatenate every posting's description into one string for the
  # word-frequency analysis.
  brief_list = df_clean_concat['brief'].tolist()
  brief_str = ''.join(brief_list)
  print(type(brief_str))
  # The word-cloud script reads this text from 'brief_quanguo.txt'; the
  # step that writes brief_str to that file as UTF-8 was left commented out
  # in the original cell.

  1. <class 'str'>


  # -*- coding: utf-8 -*-
  """
  Created on Wed May 17 2017
  @author: lemon

  Build a word cloud from the job descriptions stored in
  'brief_quanguo.txt', shaped and coloured after 'colors.png', then show it
  and save it next to this script.
  """
  import jieba
  from wordcloud import WordCloud, ImageColorGenerator
  import matplotlib.pyplot as plt
  import os
  import PIL.Image as Image
  import numpy as np

  # Read the descriptions as UTF-8 text (the encoding used by the step that
  # writes this file). The `with` block closes the file on exit, so no
  # explicit close() is needed.
  with open('brief_quanguo.txt', encoding='utf-8') as f:
      text = f.read()

  # Segment the Chinese text with jieba. cut_all=False is precise mode;
  # cut_all=True would be full mode.
  wordlist = jieba.cut(text, cut_all=False)
  wordlist_space_split = ' '.join(wordlist)

  # colors.png, located beside this script, serves both as the mask and as
  # the colour source.
  d = os.path.dirname(__file__)
  alice_coloring = np.array(Image.open(os.path.join(d, 'colors.png')))

  # NOTE(review): no font_path is given to WordCloud; confirm that the
  # default font can render the Chinese tokens.
  my_wordcloud = WordCloud(background_color='#F0F8FF', max_words=100, mask=alice_coloring,
                           max_font_size=300, random_state=42).generate(wordlist_space_split)

  # Recolour the words from the image. The original passed the recoloured
  # cloud to plt.show(), which does not take an image to draw; recolour
  # first, then draw with imshow().
  image_colors = ImageColorGenerator(alice_coloring)
  my_wordcloud.recolor(color_func=image_colors)

  plt.imshow(my_wordcloud)   # display the word cloud as an image
  plt.axis('off')            # hide the axes
  plt.show()

  my_wordcloud.to_file(os.path.join(d, 'brief_quanguo_colors_cloud.png'))


4 北京

4.1 月薪分布情况

  # Select the postings whose work location mentions Beijing, export them
  # to Excel and report how many there are.
  is_beijing = df_clean_concat['gzdd'].str.contains('北京.*', regex=True)
  df_beijing = df_clean_concat[is_beijing]
  df_beijing.to_excel('zhilian_kw_python_bj.xlsx')
  print('总行数为:{}行'.format(df_beijing.shape[0]))

  1. 总行数为:6936


4.2 相关技能要求

  # Concatenate the Beijing job descriptions into one string for the
  # word-frequency analysis.
  brief_list_bj = df_beijing['brief'].tolist()
  brief_str_bj = ''.join(brief_list_bj)
  print(type(brief_str_bj))
  # The step that writes brief_str_bj to 'brief_beijing.txt' as UTF-8 was
  # left commented out in the original cell.

  1. <class 'str'>


5 长沙

5.1 月薪分布情况

  # Select the postings whose work location mentions Changsha, export them
  # to Excel and report how many there are.
  is_changsha = df_clean_concat['gzdd'].str.contains('长沙.*', regex=True)
  df_changsha = df_clean_concat[is_changsha]
  df_changsha.to_excel('zhilian_kw_python_cs.xlsx')
  print('总行数为:{}行'.format(df_changsha.shape[0]))

  1. 总行数为:280


5.2 相关技能要求

  # Concatenate the Changsha job descriptions into one string for the
  # word-frequency analysis.
  brief_list_cs = df_changsha['brief'].tolist()
  brief_str_cs = ''.join(brief_list_cs)
  print(type(brief_str_cs))
  # The step that writes brief_str_cs to 'brief_changsha.txt' as UTF-8 was
  # left commented out in the original cell.

  1. <class 'str'>




