vine数据集处理2

需求

现在拥有了一一对应的id::::text(video_text_hash.txt)和id::::url(video_url_hash.txt)。需求是永远递增的嘛~~~
将video_text_hash.txt中text内容的#hash拿出来,单独存成id::::text(id_text.txt)和id::::hash(id_hash.txt)
PS:拿完#hash后剩下的text若为空则删除此条目

handle_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import re
# From video_text_hash.txt, pull the hashtags out of each record to produce
# id_hash.txt and the remaining id_text.txt; records whose remaining text is
# empty are diverted to id_text_void.txt instead.
f_video_texts = open("video_text_hash.txt",encoding='utf8', errors='ignore')
f_text = open("id_text.txt",'w',encoding='utf8', errors='ignore')
f_text_void = open("id_text_void.txt",'w',encoding='utf8', errors='ignore')
f_hash = open("id_hash.txt",'w',encoding='utf8', errors='ignore')

def get_hashs(template):
    """Extract hashtags from *template*.

    A hashtag starts at '#' and runs until the next space or newline.
    Fixes over the original: the builtin ``str`` is no longer shadowed,
    and a tag that runs to the very end of the string (no terminating
    delimiter, e.g. the last line of a file without '\\n') is kept
    instead of being silently dropped.

    Returns a list of tag strings without the leading '#'.
    """
    tags = []
    buf = ""
    copying = False
    for ch in template:
        if ch == '#':
            copying = True
        elif ch == ' ' or ch == '\n':
            copying = False
            if buf != "":
                tags.append(buf)
                buf = ""
        elif copying:
            buf = buf + ch
    if buf != "":
        tags.append(buf)  # fix: keep a trailing tag with no delimiter after it
    return tags

def getText(template):
    """Return the field(s) found between '::::' and the following newline."""
    field_pattern = re.compile(r'::::(.*?)\n')
    return field_pattern.findall(template)

def getId(template):
    """Return the id part(s) preceding each '::::' separator."""
    id_pattern = re.compile(r'(.*?)::::')
    return id_pattern.findall(template)

# Main loop: split each "id::::text" line into its hashtags and its
# remaining text.  Records whose text is empty after tag removal go to the
# void file; otherwise id::::text and id::::hashs are written in parallel.
ii = 0
while 1:
    line_video_text = f_video_texts.readline()
    if not line_video_text:
        break
    try:
        list_hash = get_hashs(line_video_text)          # tags on this line
        str_text = str(getText(line_video_text)[0]) + '\n'
        line_id = str(getId(line_video_text)[0])
    except IndexError:
        # Fix: the original fell through to the writes with a stale (or
        # undefined) str_text after a malformed line; skip the line instead.
        print("error")
        continue
    str_hashs = ''
    for tag in list_hash:
        tag_with_space = '#' + str(tag) + ' '
        str_hashs += tag_with_space
        tag_with_newline = '#' + str(tag) + '\n'
        str_text = str_text.replace(tag_with_space, "")      # strip tag
        str_text = str_text.replace(tag_with_newline, "\n")  # tag at line end
    if str_text != '\n':
        f_text.write(line_id + '::::' + str_text)
        f_hash.write(line_id + '::::' + str_hashs + '\n')
    else:
        f_text_void.write(line_id + '\n')  # nothing left after tag removal
    ii += 1
    if ii % 10000 == 0:
        print(ii)  # progress indicator

f_video_texts.close()
f_text.close()
f_text_void.close()
f_hash.close()
text处理

对于id_text.txt里的text,还需要做以下操作:1、去除非英文字符。2、去除停用词。3、小写化text。
得到id_text_final.txt和被剔除的剩余为空的id存到id_text_final_void.txt。
同时顺便得到清理后的id::::hashs文件id_hash_final.txt(顺便小写化)

handle_text_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
# Clean the text of id_text.txt to produce id_text_final.txt, diverting ids
# whose text becomes empty to id_text_final_void.txt.
# Also emit the cleaned id::::hashs file id_hash_final.txt along the way.
f_video_texts = open("id_text.txt",encoding='utf8', errors='ignore')
f_stop_word = open("stop_word.txt",encoding='utf8', errors='ignore')
f_id_hash = open("id_hash.txt",encoding='utf8', errors='ignore')
f_id_text_final = open("id_text_final.txt",'w',encoding='utf8', errors='ignore')
f_id_text_final_void = open("id_text_final_void.txt",'w',encoding='utf8', errors='ignore')
f_id_hash_final = open("id_hash_final.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Extract every text field: the span after '::::' up to the newline."""
    return [match.group(1) for match in re.finditer(r'::::(.*?)\n', template)]

def getId(template):
    """Extract every id: the (shortest) span before each '::::'."""
    return [match.group(1) for match in re.finditer(r'(.*?)::::', template)]

# str.isalpha() is True for ANY Unicode letter; here we need strictly
# ASCII [A-Za-z], so round-trip through an ASCII encode first.
def isAlpha(word):
    """True iff *word* is non-empty and consists only of ASCII letters."""
    try:
        ascii_bytes = word.encode('ascii')
    except UnicodeEncodeError:
        return False
    return ascii_bytes.isalpha()

# Load the stop words.  Stored in a set (was a list) so the per-word
# membership test in the main loop is O(1) instead of O(n).
list_stop_word = set()
for line_stop_word in f_stop_word:
    list_stop_word.add(str(line_stop_word).replace("\n", '').lower())
print("停用词列表生成成功")

# Build {id: hashs} from id_hash.txt so hashes can be looked up by id while
# streaming id_text.txt below.
dict_hash = {}
for line_hash in f_id_hash:
    dict_hash[getId(line_hash)[0]] = getText(line_hash)[0]
print("hash字典生成成功")

# Per line of id_text.txt: keep only ASCII-letter words, drop stop words,
# then keep only the all-ASCII hashtags of the same id.
# NOTE(review): the write-up says the text should also be lower-cased, but
# this code never calls .lower() on the words — confirm which is intended.
ii = 0
while 1:
    line_video_text = f_video_texts.readline()
    if not line_video_text:
        break
    try:
        raw_text = str(getText(line_video_text)[0])
        # Keep only ASCII letters and spaces (spaces preserved for split).
        letters_only = ''
        for ch in raw_text:
            if isAlpha(ch) or ch == ' ':
                letters_only += ch
        # Drop empty tokens and stop words.
        text_final = ''
        for word in letters_only.split(' '):
            if word != '' and word not in list_stop_word:
                text_final += word + ' '
        if text_final != '':
            line_id = getId(line_video_text)[0]
            # Keep only hashtags made purely of ASCII letters.
            str_hash = ''
            for tag in re.findall(r'#(.*?) ', dict_hash[line_id]):
                if all(isAlpha(ch) for ch in tag):
                    str_hash += '#' + tag + ' '
            if str_hash != '':
                f_id_text_final.write(line_id + '::::' + text_final + '\n')
                f_id_hash_final.write(line_id + '::::' + str_hash + '\n')
            else:
                f_id_text_final_void.write(line_id + '\n')
        # (If text_final is empty the id is written nowhere — original behavior.)
    except (IndexError, KeyError):
        # IndexError: malformed line.  KeyError (fix): an id missing from
        # dict_hash previously crashed the whole run uncaught.
        print("error")
    ii += 1
    if ii % 10000 == 0:
        print(ii)  # progress indicator

f_video_texts.close()
f_stop_word.close()
f_id_hash.close()
f_id_text_final.close()
f_id_text_final_void.close()
f_id_hash_final.close()
统计hash词频

根据id_hash_final.txt做hash的词频统计得到count_hash.txt

count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Count hashtag totals and per-tag frequencies.
import collections
import re
f_id_hash_final = open("id_hash_final.txt", encoding='utf8', errors='ignore')
f_count_hash = open("count_hash.txt", 'w', encoding='utf8', errors='ignore')

def subString1(template):
    """Extract hashtags from a line: each starts at '#' and ends at a space
    or newline.

    Fixes over the original: '\\n' also terminates a tag, and a tag running
    to the end of the string is kept — previously a line whose last tag was
    not followed by a space lost that tag.  Also avoids shadowing ``str``.
    """
    tags = []
    buf = ""
    copying = False
    for ch in template:
        if ch == '#':
            copying = True
        elif ch == ' ' or ch == '\n':
            copying = False
            if buf != "":
                tags.append(buf)
                buf = ""
        elif copying:
            buf = buf + ch
    if buf != "":
        tags.append(buf)  # trailing tag with no delimiter
    return tags

# Collect every hashtag from id_hash_final.txt, count frequencies with
# Counter, then write one "(tag, count)" tuple per line, most common first.
slotList = []
for line_id_hash in f_id_hash_final:
    slotList += subString1(line_id_hash)

result = collections.Counter(slotList)  # frequency table
for each in result.most_common():
    f_count_hash.write(str(each) + '\n')
f_id_hash_final.close()
f_count_hash.close()
截取指定长度数据集

遍历id_hash_final.txt里的hash,走到最后一个频率为5的hash(premiosjuventud)停止
首先生成对应的list与dict以供快速读取

get_list_or_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
# Persist fast-lookup structures:
#   id_text_final  --> dict_id_text_final
#   id_hash_final  --> dict_id_hash_final
#   count_hash     --> list_count_hash
#   video_url_hash --> dict_id_url
f_id_text_final = open("id_text_final.txt",encoding='utf8', errors='ignore')
f_dict_id_text_final = open("dict_id_text_final.txt",'w',encoding='utf8', errors='ignore')
f_id_hash_final = open("id_hash_final.txt",encoding='utf8', errors='ignore')
f_dict_id_hash_final = open("dict_id_hash_final.txt",'w',encoding='utf8', errors='ignore')
f_count_hash = open("count_hash.txt",encoding='utf8', errors='ignore')
f_list_count_hash = open("list_count_hash.txt",'w',encoding='utf8', errors='ignore')
f_video_url_hash = open("video_url_hash.txt",encoding='utf8', errors='ignore')
f_dict_id_url = open("dict_id_url.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Pull the value part(s) out of 'id::::value' lines."""
    matches = re.findall(r'::::(.*?)\n', template)
    return matches

def getId(template):
    """Pull the id part(s) out of 'id::::value' lines."""
    matches = re.findall(r'(.*?)::::', template)
    return matches

# Parse count_hash.txt lines of the form "('tag', n)" back into a flat list
# of tags, in descending-frequency order.  repr() quotes a tag with double
# quotes when the tag itself contains an apostrophe, hence the fallback.
list_hash = []
for line_hash in f_count_hash:
    single_quoted = re.findall(r"'(.*?)'", line_hash)
    if single_quoted:
        list_hash.append(str(single_quoted[0]))
    else:
        double_quoted = re.findall(r'"(.*?)"', line_hash)
        list_hash.append(str(double_quoted[0]))
f_list_count_hash.write(str(list_hash))

# Build {id: text} and {id: hashs} by reading the two *_final files in
# lockstep, then persist each dict as its repr.
dict_id_text_final = {}
dict_id_hash_final = {}
ii = 1
while 1:
    line_id_text = f_id_text_final.readline()
    line_id_hash = f_id_hash_final.readline()
    if not line_id_text:
        break
    dict_id_text_final[getId(line_id_text)[0]] = getText(line_id_text)[0]
    dict_id_hash_final[getId(line_id_hash)[0]] = getText(line_id_hash)[0]
    ii += 1
    if ii % 50000 == 0:
        print(ii)  # progress indicator
f_dict_id_text_final.write(str(dict_id_text_final))
f_dict_id_hash_final.write(str(dict_id_hash_final))

# Build {id: url} from video_url_hash.txt, persist it, then close all files.
dict_id_url = {}
ii = 1
for line_id_url in f_video_url_hash:
    dict_id_url[getId(line_id_url)[0]] = getText(line_id_url)[0]
    ii += 1
    if ii % 50000 == 0:
        print(ii)  # progress indicator
f_dict_id_url.write(str(dict_id_url))

f_id_text_final.close()
f_dict_id_text_final.close()
f_id_hash_final.close()
f_dict_id_hash_final.close()
f_count_hash.close()
f_list_count_hash.close()
f_video_url_hash.close()
f_dict_id_url.close()

最后得到一定数量的数据集final_id_texts/hashs/urls.txt

get_final_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import ast
import re
# From count_hash.txt and id_hash_final.txt, produce the final dataset of
# the required size: final_texts/hashs/urls.txt.
f_dict_id_text_final = open("dict_id_text_final.txt",encoding='utf8', errors='ignore')
f_dict_id_hash_final = open("dict_id_hash_final.txt",encoding='utf8', errors='ignore')
f_list_count_hash = open("list_count_hash.txt",encoding='utf8', errors='ignore')
f_dict_id_url = open("dict_id_url.txt",encoding='utf8', errors='ignore')
f_final_texts = open("final_texts.txt",'w',encoding='utf8', errors='ignore')
f_final_hashs = open("final_hashs.txt",'w',encoding='utf8', errors='ignore')
f_final_urls = open("final_urls.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return all '::::'-to-newline fields contained in *template*."""
    extractor = re.compile(r'::::(.*?)\n')
    return [m[1] for m in extractor.finditer(template)]

def getId(template):
    """Return all ids, i.e. the spans preceding each '::::' separator."""
    extractor = re.compile(r'(.*?)::::')
    return [m[1] for m in extractor.finditer(template)]

# Reload the persisted list/dicts written by get_list_or_dict.py.
# ast.literal_eval replaces eval(): it parses Python literals only, so a
# tampered or corrupted file cannot execute arbitrary code.  `with` blocks
# guarantee the files are closed.
with open("list_count_hash.txt", encoding='utf8', errors='ignore') as f:
    list_hash = ast.literal_eval(f.read())
print("list_count_hash读取成功\n")
print(len(list_hash))

with open("dict_id_text_final.txt", encoding='utf8', errors='ignore') as f:
    dict_id_text_final = ast.literal_eval(f.read())
print("dict_id_text_final读取成功\n")

with open("dict_id_hash_final.txt", encoding='utf8', errors='ignore') as f:
    dict_id_hash_final = ast.literal_eval(f.read())
print("dict_id_hash_final读取成功\n")

with open("dict_id_url.txt", encoding='utf8', errors='ignore') as f:
    dict_id_url = ast.literal_eval(f.read())
print("dict_id_url读取成功\n")

print("开始匹配")
# Match records to tags, most frequent tag first.  A matched record is
# written to all three output files and then removed from the pool, so each
# id is emitted at most once.  Processing stops after 'premiosjuventud',
# the last tag whose frequency is 5.
iii = 0
for hash in list_hash:
    matched = [key for key in dict_id_hash_final
               if "#" + hash + " " in dict_id_hash_final[key]]
    for key in matched:
        f_final_texts.write(key + '::::' + dict_id_text_final[key] + '\n')
        f_final_hashs.write(key + '::::' + dict_id_hash_final[key] + '\n')
        f_final_urls.write(key + '::::' + dict_id_url[key] + '\n')
        del dict_id_hash_final[key]
    ii = len(matched)
    iii += ii
    print(str(hash) + "--匹配结束,其写入--" + str(ii) + '--条--共计写入' + str(iii) + '条')
    if str(hash) == 'premiosjuventud':
        break

f_dict_id_text_final.close()
f_dict_id_hash_final.close()
f_list_count_hash.close()
f_dict_id_url.close()
f_final_texts.close()
f_final_hashs.close()
f_final_urls.close()
再处理hash

将final_hashs.txt复制到final_hashs1.txt,再将每条hashs中出现小于5次的hash删除,得到最终的final_hashs.txt。

clean_final_hashs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# For each item of final_hashs1.txt, remove the hashtags that occur fewer
# than 5 times, producing the final final_hashs.txt.
import re
f_final_hashs1 = open("final_hashs1.txt", encoding='utf8', errors='ignore')
f_list_hash = open("count_hash.txt", encoding='utf8', errors='ignore')
f_final_hashs = open("final_hashs.txt", 'w', encoding='utf8', errors='ignore')

def getText(template):
    """Value extractor for 'id::::value' lines (kept for symmetry)."""
    result = []
    for m in re.finditer(r'::::(.*?)\n', template):
        result.append(m.group(1))
    return result

def getId(template):
    """Id extractor for 'id::::value' lines."""
    result = []
    for m in re.finditer(r'(.*?)::::', template):
        result.append(m.group(1))
    return result

# Build the set of known tags from count_hash.txt.  A set (was a list)
# makes the per-tag membership test in the loop below O(1) instead of O(n).
# NOTE(review): lines whose tag was repr'd with double quotes (tag contains
# an apostrophe) do not match this pattern and are skipped — confirm intended.
# NOTE(review): count_hash.txt appears to hold every tag regardless of
# frequency, which would make the <5 filter a no-op — verify the file is
# truncated to freq>=5 before this step.
list_hash = set()
for line_id_hash in f_list_hash:
    list_hash.update(re.findall(r"'(.*?)'", line_id_hash))
print('list_hash生成成功')

# Rewrite each record of final_hashs1.txt keeping only tags present in
# list_hash; a record left with no tags is reported ('------'), not written.
i = 0
while 1:
    line_hashs = f_final_hashs1.readline()
    if not line_hashs:
        break
    try:
        kept = ''
        for tag in re.findall(r"#(.*?) ", line_hashs):
            if str(tag) in list_hash:
                kept += '#' + str(tag) + ' '
        if kept != '':
            f_final_hashs.write(str(getId(line_hashs)[0]) + '::::' + kept + '\n')
        else:
            print('------')  # record lost all of its tags
        i += 1
    except IndexError:
        print("error")  # fix: message was misspelled "erro"
    if i % 10000 == 0:
        print(i)  # progress indicator

f_final_hashs1.close()
f_list_hash.close()
f_final_hashs.close()
小结

到目前为止,获得了121117条一一对应的final_hashs/urls/texts.txt文件
数据集与代码,发布在github.