其他
用Python爬取3万多条评论,看韩国人如何评价电影《寄生虫》?
The following article is from 凹凸数据. Author: 朱小五
def main():
    """Scrape Naver Movie review pages for film code 161967 and save them as CSV.

    Pages 1 through 199 of the review list (ordered newest first) are
    requested one at a time.  For each page, ten review slots are read and
    one dict per slot is collected with these keys:

    * ``star``     - text of the ``div.star_score > em`` element
    * ``text``     - text of the ``#_filtered_ment_<n>`` element
    * ``datetime`` - text of the second ``em`` in the review header
    * ``name``     - text of the link in the first ``em`` of the review header
    * ``zan``      - count shown on the ``_sympathyButton`` link
    * ``cai``      - count shown on the ``_notSympathyButton`` link

    Returns:
        The list of collected dicts.  The same rows are also written to
        ``寄生虫评论.csv``.

    NOTE(review): the listing this was taken from had lost its indentation;
    the CSV export is assumed to run once, after all pages are collected.
    ``restaurant`` is a helper defined elsewhere - presumably it downloads
    the URL and returns the HTML; confirm against its definition.
    """
    base_url = (
        'https://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn'
        '?code=161967&type=after&onlyActualPointYn=N&onlySpoilerPointYn=N'
        '&order=newest&page='
    )
    rows = []
    for page in range(1, 200):  # number of pages to crawl
        page_url = base_url + str(page)
        print('准备采集第{}页数据'.format(page))
        doc = pq(restaurant(page_url))
        for slot in range(10):
            print(slot)
            # CSS :nth-child is 1-based, while the comment-text ids are 0-based.
            item = 'li:nth-child(' + str(slot + 1) + ')'
            header = item + ' > div.score_reple > dl > dt'
            buttons = item + ' > div.btn_area'
            rows.append({
                'star': doc(item + ' > div.star_score > em').text(),
                'text': doc('#_filtered_ment_' + str(slot)).text(),
                'datetime': doc(header + ' > em:nth-child(2)').text(),
                'name': doc(header + ' > em:nth-child(1) > a').text(),
                'zan': doc(buttons + ' > a._sympathyButton > strong').text(),
                'cai': doc(buttons + ' > a._notSympathyButton > strong').text(),
            })
        # Request throttling is currently disabled:
        # time.sleep(random.random())
    pd.DataFrame(rows).to_csv('寄生虫评论.csv', encoding="utf_8", index=False)
    return rows
def data_cleaning(df):
    """Fill missing values in every column of *df* and return the same frame.

    Columns whose dtype is ``object`` have missing entries replaced with the
    placeholder string ``'缺失数据'``; every other column has them replaced
    with ``0``.

    Args:
        df: The pandas DataFrame to clean.  It is modified in place.

    Returns:
        The same DataFrame object that was passed in, with no missing values
        left in any column.
    """
    for col in df.columns:
        fill_value = '缺失数据' if df[col].dtype == 'object' else 0
        # Assign the filled column back instead of calling
        # ``df[col].fillna(..., inplace=True)``: the in-place form operates on
        # a column selection, which pandas deprecates and which leaves ``df``
        # unchanged when Copy-on-Write is active.
        df[col] = df[col].fillna(fill_value)
    return df
def translate(text):
    """Translate *text* through the Youdao web endpoint (type ``KR2ZH_CN``).

    Args:
        text: The string to translate.  It is sent as the ``i`` query
            parameter.

    Returns:
        The ``tgt`` field of the first entry in the response's
        ``translateResult`` list.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        KeyError, IndexError: If the response JSON lacks the expected shape.

    After each call the function sleeps for 2-3 seconds to throttle requests
    and prints a progress marker.
    """
    # Pass the query through ``params`` so that requests URL-encodes it;
    # concatenating raw text into the URL breaks on characters such as
    # '&', '#', '+' or spaces.
    response = requests.get(
        'http://fanyi.youdao.com/translate',
        params={'doctype': 'json', 'type': 'KR2ZH_CN', 'i': text},
        timeout=10,  # do not let one stalled connection hang the whole run
    )
    payload = response.json()
    translated = payload['translateResult'][0][0]['tgt']
    time.sleep(2 + random.random())
    print('翻译中')
    return translated
# Add a ``text_t`` column holding the translation of each row's ``text`` value.
# (The original statement began with ``ata``, a typo for ``data`` that raises
# NameError.)  Here ``data`` must be a DataFrame with a ``text`` column.
data['text_t'] = data['text'].apply(translate)
# Draw a line chart of ``score_by_time`` (built elsewhere in the script).
# NOTE(review): ``.index.date`` implies a datetime-like index - confirm.
plt.figure(dpi=200, figsize=(8, 5))
y = score_by_time.values
x = score_by_time.index.date
# Green solid line with a point marker at every observation.
plt.plot(x, y, color="g", linestyle="-", marker=".")
plt.xlabel("月份")
plt.ylabel("评论数")
plt.title("韩国电影《寄生虫》评论数走势图")
即日起至 3月21日,千万流量支持原创作者,更有专属【勋章】等你来挑战
☞用于小型图形挖掘研究的瑞士军刀:空手道俱乐部的图表学习Python库
☞Libra新编程语言 :Move 的所有权模型灵感来源原来是它……