Twitter Crawler 2

Requirements

Use tweepy to crawl tweets by hashtag (roughly 500 tweets per hashtag)
Clean the tweet text: strip non-English characters, links, hashtags, and stop words
Store the results as a hash:tweets dictionary, then save the dictionary to a file: dict_hash_twitter

Tweet collection

Because of the network situation in mainland China, to be on the safe side I save the tweets for each hashtag to a file first and process them afterwards.
It turned out that tweepy's search method is rate limited, but I registered four Twitter apps,
and by rotating through their keys the download can keep running without interruption.
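
If one app were enough, tweepy 3.x could instead simply sleep until the rate-limit window resets rather than rotating keys; a minimal sketch (the credentials are placeholders):

import tweepy
from tweepy import OAuthHandler

auth = OAuthHandler('consumer_key', 'consumer_secret')
auth.set_access_token('access_token', 'access_secret')
# wait_on_rate_limit makes tweepy sleep and retry once the search quota is exhausted
api = tweepy.API(auth, proxy="127.0.0.1:1080",
                 wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

With four apps, rotating keys on failure avoids the waiting entirely, which is what get_twitter.py below does.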

get_twitter.py
import tweepy
from tweepy import OAuthHandler
import re

list_consumer_key = ['########################', '########################', '########################',
                     '########################']
list_consumer_secret = ['########################',
                        '########################',
                        '########################',
                        '########################']
list_access_token = ['########################-########################',
                     '########################-########################',
                     '########################-########################',
                     '########################-########################']
list_access_secret = ['########################', '########################',
                      '########################', '########################']

num_key = 0
consumer_key = list_consumer_key[num_key]
consumer_secret = list_consumer_secret[num_key]
access_token = list_access_token[num_key]
access_secret = list_access_secret[num_key]

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# api = tweepy.API(auth)  # without a proxy
api = tweepy.API(auth, proxy="127.0.0.1:1080")

# read the hashtag list; each line of count_hash.txt contains a quoted hashtag
f_hash = open("count_hash.txt", encoding='utf8', errors='ignore')
list_hash = []
while 1:
    line_hash = f_hash.readline()
    if not line_hash:
        break
    else:
        list_hash.append(str(re.findall(r"'(.*?)'", line_hash, flags=0)[0]))
f_hash.close()
print('list_hash built: ' + str(len(list_hash)))

for id in range(1056, len(list_hash)):
    try:
        query = str(list_hash[id])
        print(query)
        # fetch up to 300 English tweets for this hashtag (tweepy 3.x search API)
        tweets = tweepy.Cursor(api.search, q=query, count=100, lang='en', include_entities=False).items(300)
        f_twitter = open("./hash_twitter/" + str(list_hash[id]), 'w', encoding='utf8', errors='ignore')
        i = 0
        for tweet in tweets:
            f_twitter.write(str(tweet.text).replace('\n', ' ') + '\n')
            i += 1
        f_twitter.close()
        print(i)
        if i < 300:
            # record hashtags that returned fewer than 300 tweets
            f_short_hash = open("short_hash.txt", 'a', encoding='utf8', errors='ignore')
            f_short_hash.write(str(list_hash[id]) + '\n')
            f_short_hash.close()
        print(id)
    except (TypeError, tweepy.error.TweepError):
        print(str(list_hash[id]) + ' failed to crawl')
        f_error_hash = open("error_hash.txt", 'a', encoding='utf8', errors='ignore')
        f_error_hash.write(str(list_hash[id]) + '\n')
        f_error_hash.close()
        # switch to the next key when a request fails (e.g. rate limited)
        num_key += 1
        if num_key == 4:
            num_key = 0
        consumer_key = list_consumer_key[num_key]
        consumer_secret = list_consumer_secret[num_key]
        access_token = list_access_token[num_key]
        access_secret = list_access_secret[num_key]

        auth = OAuthHandler(consumer_key, consumer_secret)
        auth.set_access_token(access_token, access_secret)

        api = tweepy.API(auth, proxy="127.0.0.1:1080")
        print('key switched, resuming download ------------------------->')
Data processing

Each downloaded file contains one tweet per line.
The tweets need to be cleaned: strip non-English characters, links, hashtags, and stop words.
The cleaned tweets are then joined into a single line and saved to a file named after the hashtag.
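
As a quick illustration of what the cleaning does, here is a simplified version of the steps in clean_data.py applied to one made-up tweet (the sample text and the tiny stop-word set are invented for the example):

import re

stop_words = {'the', 'a', 'is', 'at', 'in'}  # hypothetical stop-word list
line = 'Messi is unstoppable at the #WorldCup https://t.co/abc123\n'

line = re.sub(r'#\w*\s', '', line)       # strip hashtags
line = re.sub(r'https\S*\s', '', line)   # strip links
line = re.sub(r'[^A-Za-z ]', ' ', line)  # keep only English letters and spaces
line = re.sub(r'\s+', ' ', line).lower()
words = [w for w in line.split(' ') if w and w not in stop_words]
print(' '.join(words))                   # -> messi unstoppable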

clean_data.py
import re
import os


def get_hashs(template):  # a hashtag starts with '#' and ends with a space or a newline
    copy = False
    finished = False
    slot_list = []
    buf = ""
    for s in template:
        if s == '#':
            copy = True
        elif s == ' ':
            copy = False
            finished = True
        elif s == '\n':
            copy = False
            finished = True
        elif copy:
            buf = buf + s
        if finished:
            if buf != "":
                slot_list.append(buf)
            buf = ""
            finished = False
    return slot_list


def file_name(file_dir):  # list the file names under a directory
    for root, dirs, files in os.walk(file_dir):
        return files


def is_alpha(word):
    try:
        return word.encode('ascii').isalpha()
    except UnicodeEncodeError:
        return False


# load the stop-word list
f_stop_word = open("stop_word.txt", encoding='utf8', errors='ignore')
list_stop_word = []
while 1:
    line_stop_word = f_stop_word.readline()
    if not line_stop_word:
        break
    else:
        list_stop_word.append(str(line_stop_word).replace("\n", '').lower())
f_stop_word.close()
print("stop-word list built: " + str(len(list_stop_word)))

list_hash_down = file_name(r'I:\推荐系统\数据集\twitter\hash_twitter')
print(len(list_hash_down))

for hash in list_hash_down:
    f_twitter = open('./hash_twitter/' + hash, encoding='utf8', errors='ignore')
    long_str = ''
    # for i in range(1, 10):
    while 1:
        line_twitter = f_twitter.readline()
        if not line_twitter:
            break
        else:
            # strip hashtags and links
            line_twitter = re.sub(r'#\w* ', '', line_twitter)
            line_twitter = re.sub(r'#\w*\n', '', line_twitter)
            line_twitter = re.sub(r'https\S* ', '', line_twitter)
            line_twitter = re.sub(r'https\S*\n', '', line_twitter)
            text_final = ''
            for s in line_twitter:
                if is_alpha(s):
                    text_final += s
                elif s == ' ' or s == '-' or s == '~':
                    text_final += ' '  # drop non-English characters, keep a separating space
            text_final = re.sub(r'\s+', ' ', text_final)
            line_twitter = text_final.lower()
            list_word = line_twitter.split(' ')  # split into words, may contain empty strings
            text_final = ''
            for word in list_word:
                if word == '':
                    continue
                elif word not in list_stop_word:  # drop stop words
                    text_final += word + ' '
            long_str += text_final
    f_twitter.close()
    long_str = re.sub(r'\n', '', long_str)
    long_str = re.sub(r'\s+', ' ', long_str)
    list_word = long_str.split(' ')
    if len(list_word) < 10:
        print(str(hash) + ' too short')
        f_short_result = open('short_result', 'a', encoding='utf8', errors='ignore')
        f_short_result.write(str(hash) + '\n')
        f_short_result.close()
    else:
        f_twitter_new = open('./hash_twitter_new/' + hash, 'w', encoding='utf8', errors='ignore')
        f_twitter_new.write(long_str)
        f_twitter_new.close()
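
The requirements also ask for a hash:tweets dictionary saved to a file named dict_hash_twitter; that step is not part of the two scripts above. A minimal sketch, assuming pickle is an acceptable storage format:

import os
import pickle

dict_hash_twitter = {}
for hash in os.listdir('./hash_twitter_new'):
    with open('./hash_twitter_new/' + hash, encoding='utf8', errors='ignore') as f:
        dict_hash_twitter[hash] = f.read()  # key: hashtag, value: cleaned tweet text

with open('dict_hash_twitter', 'wb') as f:
    pickle.dump(dict_hash_twitter, f)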