vine数据集处理2

需求

现在拥有了一一对应的id::::text(video_text_hash.txt)和id::::url(video_url_hash.txt)。需求是永远递增的嘛~~~
将video_text_hash.txt中text内容的#hash拿出来,单独存成id::::text(id_text.txt)和id::::hash(id_hash.txt)
PS:拿完#hash后剩下的text若为空则删除此条目

handle_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import re
# From video_text_hash.txt, pull the hashtags out of each record to produce
# id_hash.txt and the remaining id_text.txt; records whose remaining text is
# empty are diverted to id_text_void.txt instead.
f_video_texts = open("video_text_hash.txt",encoding='utf8', errors='ignore')
f_text = open("id_text.txt",'w',encoding='utf8', errors='ignore')
f_text_void = open("id_text_void.txt",'w',encoding='utf8', errors='ignore')
f_hash = open("id_hash.txt",'w',encoding='utf8', errors='ignore')

def get_hashs(template):
    """Extract hashtags from *template*.

    A hashtag starts at '#' and runs until the next space or newline.
    Fixes over the original: the builtin ``str`` is no longer shadowed,
    and a tag that runs to the very end of the string (no terminating
    delimiter, e.g. the last line of a file without '\\n') is kept
    instead of being silently dropped.

    Returns a list of tag strings without the leading '#'.
    """
    tags = []
    buf = ""
    copying = False
    for ch in template:
        if ch == '#':
            copying = True
        elif ch == ' ' or ch == '\n':
            copying = False
            if buf != "":
                tags.append(buf)
                buf = ""
        elif copying:
            buf = buf + ch
    if buf != "":
        tags.append(buf)  # fix: keep a trailing tag with no delimiter after it
    return tags

def getText(template):
    """Return the field(s) found between '::::' and the following newline."""
    field_pattern = re.compile(r'::::(.*?)\n')
    return field_pattern.findall(template)

def getId(template):
    """Return the id part(s) preceding each '::::' separator."""
    id_pattern = re.compile(r'(.*?)::::')
    return id_pattern.findall(template)

# Main loop: split each "id::::text" line into its hashtags and its
# remaining text.  Records whose text is empty after tag removal go to the
# void file; otherwise id::::text and id::::hashs are written in parallel.
ii = 0
while 1:
    line_video_text = f_video_texts.readline()
    if not line_video_text:
        break
    try:
        list_hash = get_hashs(line_video_text)          # tags on this line
        str_text = str(getText(line_video_text)[0]) + '\n'
        line_id = str(getId(line_video_text)[0])
    except IndexError:
        # Fix: the original fell through to the writes with a stale (or
        # undefined) str_text after a malformed line; skip the line instead.
        print("error")
        continue
    str_hashs = ''
    for tag in list_hash:
        tag_with_space = '#' + str(tag) + ' '
        str_hashs += tag_with_space
        tag_with_newline = '#' + str(tag) + '\n'
        str_text = str_text.replace(tag_with_space, "")      # strip tag
        str_text = str_text.replace(tag_with_newline, "\n")  # tag at line end
    if str_text != '\n':
        f_text.write(line_id + '::::' + str_text)
        f_hash.write(line_id + '::::' + str_hashs + '\n')
    else:
        f_text_void.write(line_id + '\n')  # nothing left after tag removal
    ii += 1
    if ii % 10000 == 0:
        print(ii)  # progress indicator

f_video_texts.close()
f_text.close()
f_text_void.close()
f_hash.close()
text处理

对于id_text.txt里的text,还需要做以下操作:1、去除非英文字符。2、去除停用词。3、小写化text。
得到id_text_final.txt和被剔除的剩余为空的id存到id_text_final_void.txt。
同时顺便得到清理后的id::::hashs文件id_hash_final.txt(顺便小写化)

handle_text_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import re
# Clean the text of id_text.txt to produce id_text_final.txt, diverting ids
# whose text becomes empty to id_text_final_void.txt.
# Also emit the cleaned id::::hashs file id_hash_final.txt along the way.
f_video_texts = open("id_text.txt",encoding='utf8', errors='ignore')
f_stop_word = open("stop_word.txt",encoding='utf8', errors='ignore')
f_id_hash = open("id_hash.txt",encoding='utf8', errors='ignore')
f_id_text_final = open("id_text_final.txt",'w',encoding='utf8', errors='ignore')
f_id_text_final_void = open("id_text_final_void.txt",'w',encoding='utf8', errors='ignore')
f_id_hash_final = open("id_hash_final.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Extract every text field: the span after '::::' up to the newline."""
    return [match.group(1) for match in re.finditer(r'::::(.*?)\n', template)]

def getId(template):
    """Extract every id: the (shortest) span before each '::::'."""
    return [match.group(1) for match in re.finditer(r'(.*?)::::', template)]

# str.isalpha() is True for ANY Unicode letter; here we need strictly
# ASCII [A-Za-z], so round-trip through an ASCII encode first.
def isAlpha(word):
    """True iff *word* is non-empty and consists only of ASCII letters."""
    try:
        ascii_bytes = word.encode('ascii')
    except UnicodeEncodeError:
        return False
    return ascii_bytes.isalpha()

# Load the stop words.  Stored in a set (was a list) so the per-word
# membership test in the main loop is O(1) instead of O(n).
list_stop_word = set()
for line_stop_word in f_stop_word:
    list_stop_word.add(str(line_stop_word).replace("\n", '').lower())
print("停用词列表生成成功")

# Build {id: hashs} from id_hash.txt so hashes can be looked up by id while
# streaming id_text.txt below.
dict_hash = {}
for line_hash in f_id_hash:
    dict_hash[getId(line_hash)[0]] = getText(line_hash)[0]
print("hash字典生成成功")

# Per line of id_text.txt: keep only ASCII-letter words, drop stop words,
# then keep only the all-ASCII hashtags of the same id.
# NOTE(review): the write-up says the text should also be lower-cased, but
# this code never calls .lower() on the words — confirm which is intended.
ii = 0
while 1:
    line_video_text = f_video_texts.readline()
    if not line_video_text:
        break
    try:
        raw_text = str(getText(line_video_text)[0])
        # Keep only ASCII letters and spaces (spaces preserved for split).
        letters_only = ''
        for ch in raw_text:
            if isAlpha(ch) or ch == ' ':
                letters_only += ch
        # Drop empty tokens and stop words.
        text_final = ''
        for word in letters_only.split(' '):
            if word != '' and word not in list_stop_word:
                text_final += word + ' '
        if text_final != '':
            line_id = getId(line_video_text)[0]
            # Keep only hashtags made purely of ASCII letters.
            str_hash = ''
            for tag in re.findall(r'#(.*?) ', dict_hash[line_id]):
                if all(isAlpha(ch) for ch in tag):
                    str_hash += '#' + tag + ' '
            if str_hash != '':
                f_id_text_final.write(line_id + '::::' + text_final + '\n')
                f_id_hash_final.write(line_id + '::::' + str_hash + '\n')
            else:
                f_id_text_final_void.write(line_id + '\n')
        # (If text_final is empty the id is written nowhere — original behavior.)
    except (IndexError, KeyError):
        # IndexError: malformed line.  KeyError (fix): an id missing from
        # dict_hash previously crashed the whole run uncaught.
        print("error")
    ii += 1
    if ii % 10000 == 0:
        print(ii)  # progress indicator

f_video_texts.close()
f_stop_word.close()
f_id_hash.close()
f_id_text_final.close()
f_id_text_final_void.close()
f_id_hash_final.close()
统计hash词频

根据id_hash_final.txt做hash的词频统计得到count_hash.txt

count.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Count hashtag totals and per-tag frequencies.
import collections
import re
f_id_hash_final = open("id_hash_final.txt", encoding='utf8', errors='ignore')
f_count_hash = open("count_hash.txt", 'w', encoding='utf8', errors='ignore')

def subString1(template):
    """Extract hashtags from a line: each starts at '#' and ends at a space
    or newline.

    Fixes over the original: '\\n' also terminates a tag, and a tag running
    to the end of the string is kept — previously a line whose last tag was
    not followed by a space lost that tag.  Also avoids shadowing ``str``.
    """
    tags = []
    buf = ""
    copying = False
    for ch in template:
        if ch == '#':
            copying = True
        elif ch == ' ' or ch == '\n':
            copying = False
            if buf != "":
                tags.append(buf)
                buf = ""
        elif copying:
            buf = buf + ch
    if buf != "":
        tags.append(buf)  # trailing tag with no delimiter
    return tags

# Collect every hashtag from id_hash_final.txt, count frequencies with
# Counter, then write one "(tag, count)" tuple per line, most common first.
slotList = []
for line_id_hash in f_id_hash_final:
    slotList += subString1(line_id_hash)

result = collections.Counter(slotList)  # frequency table
for each in result.most_common():
    f_count_hash.write(str(each) + '\n')
f_id_hash_final.close()
f_count_hash.close()
截取指定长度数据集

遍历id_hash_final.txt里的hash,走到最后一个频率为5的hash(premiosjuventud)停止
首先生成对应的list与dict以供快速读取

get_list_or_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import re
# Persist fast-lookup structures:
#   id_text_final  --> dict_id_text_final
#   id_hash_final  --> dict_id_hash_final
#   count_hash     --> list_count_hash
#   video_url_hash --> dict_id_url
f_id_text_final = open("id_text_final.txt",encoding='utf8', errors='ignore')
f_dict_id_text_final = open("dict_id_text_final.txt",'w',encoding='utf8', errors='ignore')
f_id_hash_final = open("id_hash_final.txt",encoding='utf8', errors='ignore')
f_dict_id_hash_final = open("dict_id_hash_final.txt",'w',encoding='utf8', errors='ignore')
f_count_hash = open("count_hash.txt",encoding='utf8', errors='ignore')
f_list_count_hash = open("list_count_hash.txt",'w',encoding='utf8', errors='ignore')
f_video_url_hash = open("video_url_hash.txt",encoding='utf8', errors='ignore')
f_dict_id_url = open("dict_id_url.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Pull the value part(s) out of 'id::::value' lines."""
    matches = re.findall(r'::::(.*?)\n', template)
    return matches

def getId(template):
    """Pull the id part(s) out of 'id::::value' lines."""
    matches = re.findall(r'(.*?)::::', template)
    return matches

# Parse count_hash.txt lines of the form "('tag', n)" back into a flat list
# of tags, in descending-frequency order.  repr() quotes a tag with double
# quotes when the tag itself contains an apostrophe, hence the fallback.
list_hash = []
for line_hash in f_count_hash:
    single_quoted = re.findall(r"'(.*?)'", line_hash)
    if single_quoted:
        list_hash.append(str(single_quoted[0]))
    else:
        double_quoted = re.findall(r'"(.*?)"', line_hash)
        list_hash.append(str(double_quoted[0]))
f_list_count_hash.write(str(list_hash))

# Build {id: text} and {id: hashs} by reading the two *_final files in
# lockstep, then persist each dict as its repr.
dict_id_text_final = {}
dict_id_hash_final = {}
ii = 1
while 1:
    line_id_text = f_id_text_final.readline()
    line_id_hash = f_id_hash_final.readline()
    if not line_id_text:
        break
    dict_id_text_final[getId(line_id_text)[0]] = getText(line_id_text)[0]
    dict_id_hash_final[getId(line_id_hash)[0]] = getText(line_id_hash)[0]
    ii += 1
    if ii % 50000 == 0:
        print(ii)  # progress indicator
f_dict_id_text_final.write(str(dict_id_text_final))
f_dict_id_hash_final.write(str(dict_id_hash_final))

# Build {id: url} from video_url_hash.txt, persist it, then close all files.
dict_id_url = {}
ii = 1
for line_id_url in f_video_url_hash:
    dict_id_url[getId(line_id_url)[0]] = getText(line_id_url)[0]
    ii += 1
    if ii % 50000 == 0:
        print(ii)  # progress indicator
f_dict_id_url.write(str(dict_id_url))

f_id_text_final.close()
f_dict_id_text_final.close()
f_id_hash_final.close()
f_dict_id_hash_final.close()
f_count_hash.close()
f_list_count_hash.close()
f_video_url_hash.close()
f_dict_id_url.close()

最后得到一定数量的数据集final_id_texts/hashs/urls.txt

get_final_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import ast
import re
# From count_hash.txt and id_hash_final.txt, produce the final dataset of
# the required size: final_texts/hashs/urls.txt.
f_dict_id_text_final = open("dict_id_text_final.txt",encoding='utf8', errors='ignore')
f_dict_id_hash_final = open("dict_id_hash_final.txt",encoding='utf8', errors='ignore')
f_list_count_hash = open("list_count_hash.txt",encoding='utf8', errors='ignore')
f_dict_id_url = open("dict_id_url.txt",encoding='utf8', errors='ignore')
f_final_texts = open("final_texts.txt",'w',encoding='utf8', errors='ignore')
f_final_hashs = open("final_hashs.txt",'w',encoding='utf8', errors='ignore')
f_final_urls = open("final_urls.txt",'w',encoding='utf8', errors='ignore')

def getText(template):
    """Return all '::::'-to-newline fields contained in *template*."""
    extractor = re.compile(r'::::(.*?)\n')
    return [m[1] for m in extractor.finditer(template)]

def getId(template):
    """Return all ids, i.e. the spans preceding each '::::' separator."""
    extractor = re.compile(r'(.*?)::::')
    return [m[1] for m in extractor.finditer(template)]

# Reload the persisted list/dicts written by get_list_or_dict.py.
# ast.literal_eval replaces eval(): it parses Python literals only, so a
# tampered or corrupted file cannot execute arbitrary code.  `with` blocks
# guarantee the files are closed.
with open("list_count_hash.txt", encoding='utf8', errors='ignore') as f:
    list_hash = ast.literal_eval(f.read())
print("list_count_hash读取成功\n")
print(len(list_hash))

with open("dict_id_text_final.txt", encoding='utf8', errors='ignore') as f:
    dict_id_text_final = ast.literal_eval(f.read())
print("dict_id_text_final读取成功\n")

with open("dict_id_hash_final.txt", encoding='utf8', errors='ignore') as f:
    dict_id_hash_final = ast.literal_eval(f.read())
print("dict_id_hash_final读取成功\n")

with open("dict_id_url.txt", encoding='utf8', errors='ignore') as f:
    dict_id_url = ast.literal_eval(f.read())
print("dict_id_url读取成功\n")

print("开始匹配")
# Match records to tags, most frequent tag first.  A matched record is
# written to all three output files and then removed from the pool, so each
# id is emitted at most once.  Processing stops after 'premiosjuventud',
# the last tag whose frequency is 5.
iii = 0
for hash in list_hash:
    matched = [key for key in dict_id_hash_final
               if "#" + hash + " " in dict_id_hash_final[key]]
    for key in matched:
        f_final_texts.write(key + '::::' + dict_id_text_final[key] + '\n')
        f_final_hashs.write(key + '::::' + dict_id_hash_final[key] + '\n')
        f_final_urls.write(key + '::::' + dict_id_url[key] + '\n')
        del dict_id_hash_final[key]
    ii = len(matched)
    iii += ii
    print(str(hash) + "--匹配结束,其写入--" + str(ii) + '--条--共计写入' + str(iii) + '条')
    if str(hash) == 'premiosjuventud':
        break

f_dict_id_text_final.close()
f_dict_id_hash_final.close()
f_list_count_hash.close()
f_dict_id_url.close()
f_final_texts.close()
f_final_hashs.close()
f_final_urls.close()
再处理hash

将final_hashs.txt复制到final_hashs1.txt,再将每条hashs中出现小于5次的hash删除,得到最终的final_hashs.txt。

clean_final_hashs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
# For each item of final_hashs1.txt, remove the hashtags that occur fewer
# than 5 times, producing the final final_hashs.txt.
import re
f_final_hashs1 = open("final_hashs1.txt", encoding='utf8', errors='ignore')
f_list_hash = open("count_hash.txt", encoding='utf8', errors='ignore')
f_final_hashs = open("final_hashs.txt", 'w', encoding='utf8', errors='ignore')

def getText(template):
    """Value extractor for 'id::::value' lines (kept for symmetry)."""
    result = []
    for m in re.finditer(r'::::(.*?)\n', template):
        result.append(m.group(1))
    return result

def getId(template):
    """Id extractor for 'id::::value' lines."""
    result = []
    for m in re.finditer(r'(.*?)::::', template):
        result.append(m.group(1))
    return result

# Build the set of known tags from count_hash.txt.  A set (was a list)
# makes the per-tag membership test in the loop below O(1) instead of O(n).
# NOTE(review): lines whose tag was repr'd with double quotes (tag contains
# an apostrophe) do not match this pattern and are skipped — confirm intended.
# NOTE(review): count_hash.txt appears to hold every tag regardless of
# frequency, which would make the <5 filter a no-op — verify the file is
# truncated to freq>=5 before this step.
list_hash = set()
for line_id_hash in f_list_hash:
    list_hash.update(re.findall(r"'(.*?)'", line_id_hash))
print('list_hash生成成功')

# Rewrite each record of final_hashs1.txt keeping only tags present in
# list_hash; a record left with no tags is reported ('------'), not written.
i = 0
while 1:
    line_hashs = f_final_hashs1.readline()
    if not line_hashs:
        break
    try:
        kept = ''
        for tag in re.findall(r"#(.*?) ", line_hashs):
            if str(tag) in list_hash:
                kept += '#' + str(tag) + ' '
        if kept != '':
            f_final_hashs.write(str(getId(line_hashs)[0]) + '::::' + kept + '\n')
        else:
            print('------')  # record lost all of its tags
        i += 1
    except IndexError:
        print("error")  # fix: message was misspelled "erro"
    if i % 10000 == 0:
        print(i)  # progress indicator

f_final_hashs1.close()
f_list_hash.close()
f_final_hashs.close()
小结

到目前为止,获得了121117条一一对应的final_hashs/urls/texts.txt文件
数据集与代码,发布在github.