
Making a Word Cloud from Douban Reviews of The Little Prince

The basic idea: use requests.get() to fetch the comment pages, parse the returned HTML with BeautifulSoup to extract four attributes per user (id, name, href, comment), and store those users in a MySQL database. I also added some simple CRUD helpers for the database. Then I read all the comments back out, cleaning useless ones with a regular expression as I go, and keep the cleaned comments in a plain list (the data set is small anyway). Finally I segment the Chinese text with the jieba library and generate the cloud with the wordcloud library. The appearance is customizable; mine turned out pretty ugly (mostly because I didn't care enough [shrug]). A condensed sketch of the whole pipeline follows the result image below.
Important note: I have only just started learning Python, so the code is really rough (really, really rough) and the in-memory lists are thrown together carelessly. I'll refactor it some day.

Word cloud image
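Before the full code, here is a minimal, self-contained sketch of the same pipeline on a single page, skipping the database step. The URL, CSS selector, and font path are the ones used in the code below; everything else is illustration only, and Douban may now require login or extra headers for these pages.

```
# Minimal one-page sketch: fetch -> parse -> segment -> word cloud (no database).
import requests
from bs4 import BeautifulSoup
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

url = 'https://book.douban.com/subject/1084336/comments/hot?p=1'
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'lxml')

# .short is the class Douban used for the short-comment text
comments = [tag.get_text() for tag in soup.select('.short')]

# segment the Chinese text with jieba, join with spaces for WordCloud
text = ' '.join(' '.join(jieba.cut(c)) for c in comments)

wc = WordCloud(font_path='C:\\Windows\\Fonts\\simkai.ttf',
               background_color='white', width=1000, height=860).generate(text)
plt.imshow(wc)
plt.axis('off')
plt.show()
```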

```
import requests
from bs4 import BeautifulSoup
import os
import pymysql as Db
import jieba
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
```
```
# Global list holding the users from every crawled page
UsersPages = []

class User:
    def __init__(self, id, name='', href='', comment=''):
        self.userid = id
        self.username = name
        self.userhref = href
        self.usercomment = comment

    def getusername(self):
        return self.username

    def getusercomment(self):
        return self.usercomment

    def getuserid(self):
        return self.userid

    def getuserhref(self):
        return self.userhref
```
```
def parse(htmluser):
    '''
    Parser: extract one user's information from a single comment item.
    :param htmluser: HTML of one <li class="comment-item"> element
    :return: a User instance
    '''
    usersoup = BeautifulSoup(htmluser, 'lxml')
    userid = usersoup.find('li', class_='comment-item')['data-cid']
    userinfo = usersoup.find('div', class_='avatar').a
    username = userinfo['title']
    userhref = userinfo['href']
    usercomment = usersoup.select('.short')[0].string
    # print(userid, username, userhref, usercomment)
    return User(userid, username, userhref, usercomment)
```
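A quick way to sanity-check parse() without touching the network is to feed it a hand-written snippet that mirrors the markup structure it assumes (the data-cid attribute, the .avatar link, the .short comment text); the values below are made up.

```
# Sanity check for parse() against a minimal, hand-written comment item.
sample = '''
<li class="comment-item" data-cid="12345">
  <div class="avatar"><a title="某用户" href="https://www.douban.com/people/xxx/"></a></div>
  <p class="short">一本写给大人的童话。</p>
</li>
'''
u = parse(sample)
print(u.getuserid(), u.getusername(), u.getuserhref(), u.getusercomment())
# expected: 12345 某用户 https://www.douban.com/people/xxx/ 一本写给大人的童话。
```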
```
def webCrawler():
    '''
    Crawl the comment pages for The Little Prince, save the raw HTML to
    local .txt files first, then process the files later.
    :return:
    '''
    # the page range can be adjusted to match the site
    for i in range(608, 620):
        url = 'https://book.douban.com/subject/1084336/comments/hot?p=' + str(i)
        resp = requests.get(url)   # renamed from `re` so it no longer shadows the re module
        # resp = requests.get(url, headers=random.choice(headers), proxies=random.choice(proxies))
        print(resp.status_code)
        try:
            nedpath = 'C:\\Users\\chend\\PycharmProjects\\nju\\collection\\'
            os.chdir(nedpath)
            # print(nedpath)
        except Exception as err:
            print(err)
        with open('littlePrince' + str(i) + '.txt', 'w', encoding='utf-8') as f:
            text = resp.text
            f.writelines(text)
```
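The commented-out line above hints at rotating headers and proxies. A simpler variant that just sets a User-Agent is sketched below; the header value is an example, not something from the original post.

```
# Minimal variant of the request with a User-Agent header (example value only).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
resp = requests.get('https://book.douban.com/subject/1084336/comments/hot?p=608',
                    headers=headers, timeout=10)
print(resp.status_code)
```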
```
def localAnalysis():
    # run local analysis on the pages saved by webCrawler
    for i in range(608, 620):
        Users = []
        try:
            nedpath = 'C:\\Users\\chend\\PycharmProjects\\nju\\collection\\'
            os.chdir(nedpath)
        except Exception as err:
            print(err)
        with open('littlePrince' + str(i) + '.txt', 'r+', encoding='utf-8') as f:
            content = f.read()
        soup = BeautifulSoup(content, 'lxml')
        userinfo = soup.find_all('li', class_='comment-item')
        for htmluser in userinfo:
            htmluser = str(htmluser)
            user = parse(htmluser)
            Users.append(user)
        UsersPages.append(Users)
```
```
def storeDB(UsersPages):
    '''
    Store every crawled user into the MySQL database.
    '''
    connection = Db.connect(host='localhost', user='root', password='********', db='littleprince')
    for userpage in UsersPages:
        for user in userpage:
            userid = str(user.userid).strip()
            username = str(user.getusername()).strip()
            userhref = str(user.getuserhref()).strip()
            usercomment = str(user.getusercomment()).strip()
            print(userid, username, userhref, usercomment)
            try:
                with connection.cursor() as cursor:
                    sql = "insert into user values (%s,%s,%s,%s)"
                    cursor.execute(sql, (userid, username, userhref, usercomment))
                connection.commit()
            except Exception as err:
                print(err)
    connection.close()
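storeDB() assumes a four-column `user` table already exists. The post never shows the schema, so the sketch below is a guess at a matching table; the column types and charset are my assumption.

```
# Hypothetical DDL for the `user` table that the insert above expects.
import pymysql as Db

connection = Db.connect(host='localhost', user='root', password='********', db='littleprince')
with connection.cursor() as cursor:
    cursor.execute('''
        create table if not exists user (
            userid      varchar(32) primary key,
            username    varchar(128),
            userhref    varchar(256),
            usercomment varchar(512)
        ) default charset=utf8mb4
    ''')
connection.commit()
connection.close()
```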
```
def Opdatabase(mode, user):
    '''
    Basic CRUD operations on the user table.
    '''
    connection = Db.connect(host='localhost', user='root', password='********', db='littleprince')

    # select mode: look up a specific user
    if mode == 'select':
        # Read a single record
        with connection.cursor() as cursor:
            sql = "select * from user where userid=%s"
            cursor.execute(sql, (user.getuserid(),))
            result = cursor.fetchall()
            for row in result:
                userid = str(row[0])
                username = str(row[1])
                userhref = str(row[2])
                usercomment = str(row[3])
                # print(userid, username, userhref, usercomment)
        connection.close()
        return User(userid, username, userhref, usercomment)
    # insert mode
    if mode == 'insert':
        with connection.cursor() as cursor:
            sql = "insert into user values(%s,%s,%s,%s)"
            userid = str(user.userid).strip()
            username = str(user.getusername()).strip()
            userhref = str(user.getuserhref()).strip()
            usercomment = str(user.getusercomment()).strip()
            print(userid, username, userhref, usercomment)
            cursor.execute(sql, (userid, username, userhref, usercomment))
        connection.commit()
        connection.close()
    # delete mode
    if mode == 'delete':
        with connection.cursor() as cursor:
            sql = "delete from user where userid=%s"
            userid = str(user.getuserid()).strip()
            print(userid)
            op = cursor.execute(sql, (userid,))
        connection.commit()
        connection.close()
        return op
    # update mode
    if mode == 'update':
        with connection.cursor() as cursor:
            sql = "update user set usercomment=%s where userid=%s"
            usercomment = str(input('update comment is:'))
            userid = str(user.getuserid()).strip()
            op = cursor.execute(sql, (usercomment, userid))
        connection.commit()
        connection.close()
        return op
```
```
def getAllComments():
    '''
    Pull every comment out of the database and write it to comments.txt.
    :return:
    '''
    # connect to the database
    conn = Db.connect(host='localhost', user='root', password='********', db='littleprince')
    # fetch all user comments
    with conn.cursor() as cursor:
        comments = []
        sql = "select usercomment from user"
        try:
            cursor.execute(sql)
            results = cursor.fetchall()
            for row in results:
                # clean the comments with a regular expression:
                # keep only Chinese/English comments of at least four characters
                pattern = re.compile(r'[\u4e00-\u9fa5_a-zA-Z0-9_\s,."\'~:\-_，：。‘’“”？！]{4,150}')
                match = pattern.match(row[0])
                if match:
                    comments.append(row[0])
                    # comments.append('\n')
            path = 'C:\\Users\\chend\\PycharmProjects\\nju\\'
            os.chdir(path)
            with open('comments.txt', 'w+', encoding='utf-8') as f:
                f.writelines(comments)
            del comments
        except Exception as err:
            print(err)
        finally:
            conn.close()
```
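A rough illustration of what the cleaning regex keeps: runs of Chinese/English characters and common punctuation at least four characters long. The pattern is the one used in getAllComments; the sample comments are made up.

```
# Which comments survive the cleaning step (illustration only).
import re
pattern = re.compile(r'[\u4e00-\u9fa5_a-zA-Z0-9_\s,."\'~:\-_，：。‘’“”？！]{4,150}')
samples = ['一本写给大人的童话', 'ok', '好书！', '★★★★★']
for s in samples:
    print(s, '->', bool(pattern.match(s)))
# Only the first sample matches: at least 4 consecutive allowed characters.
```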
```
def analysisWorld():
    '''
    Analyse word frequency: segment the comments with jieba for the word cloud.
    '''
    path = 'C:\\Users\\chend\\PycharmProjects\\nju\\'
    os.chdir(path)
    with open('comments.txt', 'r+', encoding='utf-8') as f:
        text = f.readlines()
    segslist = []
    for sentense in text:
        seq_list = jieba.cut(sentense, cut_all=True)
        spliitsentense = ' '.join(seq_list)
        segslist.append(spliitsentense)
    return segslist

def cloudWord(llstr):
    '''
    Render and display the word cloud.
    :return:
    '''
    path = 'C:\\Windows\\Fonts\\simkai.ttf'
    wordcloud = WordCloud(background_color="white", font_path=path, width=1000, height=860, margin=2).generate(llstr)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
```
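If you want the cloud to follow the shape of a picture (the "customizable UI" mentioned in the intro), WordCloud also accepts a mask array. The sketch below assumes comments.txt already exists and uses a placeholder silhouette file named mask.png; neither path is from the original post.

```
# Shaped word cloud via a mask image (mask.png is a placeholder silhouette).
import numpy as np
from PIL import Image
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

with open('comments.txt', encoding='utf-8') as f:
    text = ' '.join(jieba.cut(f.read()))

mask = np.array(Image.open('mask.png'))   # black-on-white silhouette image
wc = WordCloud(background_color='white',
               font_path='C:\\Windows\\Fonts\\simkai.ttf',
               mask=mask, margin=2).generate(text)
plt.imshow(wc)
plt.axis('off')
plt.show()
```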

The main function is mainly used for testing:

```
def main():
    # webCrawler()
    # localAnalysis()
    # storeDB(UsersPages)
    # user = User('10287387')
    # result = Opdatabase('select', user)
    # print(result.getuserid(), result.getusername(), result.getuserhref(), result.getusercomment())
    # print('----------------------------------------------------------------------')
    # user1 = User('220184863', 'Adminchendong', 'http://ahpuchend.github.io.com', 'hello world')
    # Opdatabase('insert', user1)
    # print('after insert')
    # print(user1.getusercomment())
    # result1 = Opdatabase('select', user1)
    # print(result1.getuserid(), result1.getusername(), result1.getuserhref(), result1.getusercomment())
    # r = Opdatabase('delete', User('220194863'))
    # print(r)
    # user1 = User('1012070166')
    # result1 = Opdatabase('select', user1)
    # print(result1.getuserid(), result1.getusername(), result1.getuserhref(), result1.getusercomment())
    # user2 = User('1012070166')
    # op = Opdatabase('update', user2)
    # print(op)
    # analysisWorld()
    getAllComments()
    allcontent = analysisWorld()
    txt = ' '.join(allcontent).strip()   # join the segmented sentences into one string for WordCloud
    cloudWord(txt)

main()
```