import sys
import langid
import os
import nltk
from nltk.tag.api import TaggerI
from nltk.internals import find_file, find_jar, config_java, java, _java_options, find_jars_within_path
import itertools
from operator import itemgetter
from nltk.stem import WordNetLemmatizer
import networkx as nx
from nltk.collocations import *
from nltk.stem.porter import *

tagger = nltk.tag.perceptron.PerceptronTagger()
wnl = WordNetLemmatizer()
colloc_list = []
entity_names = []


def filter_for_tags(tagged, tags=['NN', 'NNPS', 'NNP', 'NNS']):
    # Keep only noun tokens (the candidate keywords).
    return [item for item in tagged if item[1] in tags]


def filter_numbers(tagged):
    # Drop very short tokens and pure digits.
    return [item for item in tagged if len(item[0]) > 2 and not item[0].isdigit()]


def normalize(tagged):
    # Strip periods from tokens (e.g. abbreviations).
    return [(item[0].replace('.', ''), item[1]) for item in tagged]


def normalize_tags(tagged):
    # Collapse tags to their first letter (NN, NNS, NNP -> N).
    return [(item[0], item[1][0:1]) for item in tagged]


def lowercase(tagged):
    return [(w.lower(), t) for (w, t) in tagged]


def rstopwords(tagged):
    # Remove English stopwords.
    return [(w, t) for (w, t) in tagged if w not in nltk.corpus.stopwords.words('english')]


def lemmatize(tagged):
    # Lemmatize single-word tokens; leave merged multi-word tokens untouched.
    return [(wnl.lemmatize(item[0]), item[1]) if ' ' not in item[0] else (item[0], item[1]) for item in tagged]


def extract_entity_names(t):
    # Recursively collect entity strings from a binary ne_chunk tree.
    entity_names = []
    if hasattr(t, 'label') and t.label():
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names


def joincolloc(tagged):
    # Merge adjacent tokens that appear in colloc_list (top PMI bigrams) into one token.
    # (Loop scaffolding is assumed; it mirrors groupne3 below.)
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 1):
        if sw == 1:
            sw = 0
            continue
        if (tagged[i], tagged[i + 1]) in colloc_list:
            if tagged[i][1].startswith('NN') or tagged[i + 1][1].startswith('NN'):
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NN'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], tagged[i][1]))
            sw = 1
        else:
            tagged1.append(tagged[i])
    if tagged and sw == 0:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


def groupne2(tagged):
    # Merge two adjacent tokens that form a recognised named entity into one token.
    # (Loop scaffolding and tag choice are assumed; they mirror groupne3 below.)
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 1):
        if sw == 1:
            sw = 0
            continue
        if (tagged[i][0] + ' ' + tagged[i + 1][0]) in entity_names:
            if tagged[i][1] == 'NNP' or tagged[i + 1][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], 'NNP'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], tagged[i][1]))
            sw = 1
        else:
            tagged1.append(tagged[i])
    if tagged and sw == 0:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


def groupne3(tagged):
    # Merge three adjacent tokens that form a recognised named entity into one token.
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 2):
        if sw == 1:
            # assumed: skip a position consumed by the previous three-token merge
            sw = 0
            continue
        if (tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0]) in entity_names:
            if tagged[i][1] == 'NNP' or tagged[i + 1][1] == 'NNP' or tagged[i + 2][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], 'NNP'))
            elif tagged[i][1] == 'NN' or tagged[i + 1][1] == 'NN' or tagged[i + 2][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], 'NN'))
            elif tagged[i][1] == 'RB' or tagged[i + 1][1] == 'RB' or tagged[i + 2][1] == 'NNP':
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], 'RB'))
            else:
                tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0] + ' ' + tagged[i + 2][0], tagged[i][1]))
            sw = 1
        else:
            tagged1.append(tagged[i])
    if len(tagged) > 1:
        tagged1.append(tagged[len(tagged) - 2])
        tagged1.append(tagged[len(tagged) - 1])
    elif len(tagged) == 1:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1


def joincollocbi(tagged):
    # Merge adjacent tokens when the trailing word of the left token and the leading
    # word of the right token form a known collocation (tokens may already be
    # multi-word after joincolloc/groupne*).
    # (Loop scaffolding is assumed; it mirrors the other helpers above.)
    tagged1 = []
    sw = 0
    for i in range(len(tagged) - 1):
        if sw == 1:
            sw = 0
            continue
        if ' ' in tagged[i][0]:
            t1 = (tagged[i][0][tagged[i][0].find(' '):].strip(), tagged[i][1])
        else:
            t1 = (tagged[i][0], tagged[i][1])
        if ' ' in tagged[i + 1][0]:
            t2 = (tagged[i + 1][0][:tagged[i + 1][0].find(' ')].strip(), tagged[i + 1][1])
        else:
            t2 = (tagged[i + 1][0], tagged[i + 1][1])
        if (t1, t2) in colloc_list:
            tagged1.append((tagged[i][0] + ' ' + tagged[i + 1][0], tagged[i][1]))
            sw = 1
        else:
            tagged1.append(tagged[i])
    if tagged and sw == 0:
        tagged1.append(tagged[len(tagged) - 1])
    return tagged1
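

# The helper functions above are applied to each tagged sentence in this order:
# joincolloc and joincollocbi merge high-PMI collocations into single tokens,
# groupne2/groupne3 merge two- and three-token named entities, and the
# filter/normalize/lowercase/lemmatize/stopword steps reduce each sentence to
# candidate keyword tokens. Note that entity_names is only populated if the
# commented-out extract_entity_names() calls in the first pass are re-enabled.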
blacklist = []
fname = sys.argv[1]
articles = os.listdir(fname)
FOLDER = 'keywords-' + fname + '-textrank'
if not os.path.exists(FOLDER):
    os.makedirs(FOLDER)
tagged = []
for article in articles:
    # articleFile = open(fname + '/' + article, 'r')
    articleFile = open(fname + '/' + article, 'r', encoding='UTF-8')
    for linee in articleFile:
        # line = linee.decode('latin-1')
        line = linee
        lang = langid.classify(line.strip())
        if not lang[0] == 'en':
            continue
        sentences = nltk.sent_tokenize(line.strip())
        tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
        tagged_sentences = [tagger.tag(sentence) for sentence in tokenized_sentences]
        for sentence in tagged_sentences:
            tagged.extend(sentence)
        chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
        # for tree in chunked_sentences:
        #     entity_names.extend(extract_entity_names(tree))
    articleFile.close()
# entity_names = set(entity_names)
# print(entity_names)
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(tagged)
finder.apply_freq_filter(3)
colloc_list = finder.nbest(bigram_measures.pmi, 20)  # this needs to be tweaked
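
# colloc_list now holds the 20 highest-PMI bigrams (as pairs of (word, tag)
# tuples) that occur at least 3 times across the whole corpus; joincolloc and
# joincollocbi consult it when merging tokens in the per-article pass below.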
for article in articles:
    print('Reading articles/' + article)
    # articleFile = open(fname + '/' + article, 'r')
    articleFile = open(fname + '/' + article, 'r', encoding='UTF-8')
    tagged = []
    sentences = []
    k = 0
    for linee in articleFile:
        # line = linee.decode('latin-1')
        line = linee
        lang = langid.classify(line.strip())
        if not lang[0] == 'en':
            continue
        sents = nltk.sent_tokenize(line.strip())
        tok_sents = [nltk.word_tokenize(sent) for sent in sents]
        tagged_sents = [tagger.tag(sent) for sent in tok_sents]
        tagged_sents = [joincolloc(sent) for sent in tagged_sents]
        tagged_sents = [joincollocbi(sent) for sent in tagged_sents]
        tagged_sents = [groupne2(sent) for sent in tagged_sents]
        tagged_sents = [groupne3(sent) for sent in tagged_sents]
        tagged_sents = [filter_for_tags(sent) for sent in tagged_sents]
        tagged_sents = [normalize_tags(sent) for sent in tagged_sents]
        tagged_sents = [normalize(sent) for sent in tagged_sents]
        tagged_sents = [filter_numbers(sent) for sent in tagged_sents]
        tagged_sents = [lowercase(sent) for sent in tagged_sents]
        tagged_sents = [lemmatize(sent) for sent in tagged_sents]
        tagged_sents = [rstopwords(sent) for sent in tagged_sents]
        for sent in tagged_sents:
            tagged.extend(sent)
        sentences.extend(tagged_sents)
    # Build an undirected co-occurrence graph: every pair of candidate tokens in
    # the same sentence is connected; parallel edges are then collapsed into
    # a weighted simple graph before running PageRank.
    gr = nx.MultiGraph()
    for sentence in sentences:
        if len(sentence) > 1:
            for i in range(len(sentence) - 1):
                for j in range(i + 1, len(sentence)):
                    try:
                        s1 = sentence[i][0] + '/' + sentence[i][1]
                        s2 = sentence[j][0] + '/' + sentence[j][1]
                        # wt = float(1.0) / float(len(sentence))  # if weighting by sentence length is desired
                        wt = 1
                        gr.add_edge(s1, s2, weight=wt)
                    except Exception:  # broad guard; the original caught an AdditionError that networkx does not define
                        pass
    H = nx.Graph()
    for u, v, d in gr.edges(data=True):
        w = d['weight']
        if H.has_edge(u, v):
            H[u][v]['weight'] += w
        else:
            H.add_edge(u, v, weight=w)
    calculated_page_rank = nx.pagerank(H)
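    # calculated_page_rank maps each 'word/TAG' node of the co-occurrence graph
    # to its TextRank (PageRank) score. The code below writes space-separated
    # 'keyword:score' entries to FOLDER/<article>, grouping nodes with equal
    # scores, stripping the '/TAG' suffix, and skipping blacklisted, short, or
    # duplicate words.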
    keyphraseFile = open(FOLDER + '/' + article, 'w')
    # di = sorted(calculated_page_rank.iteritems(), key=itemgetter(1), reverse=True)
    di = sorted(calculated_page_rank.items(), key=itemgetter(1), reverse=True)
    dic = []
    # for k, g in itertools.groupby(di, key=itemgetter(1)):
    #     try:
    #         w = str(map(itemgetter(0), g)[0])
    #         w = w[:w.find('/')]
    #         if len(w) > 2 and w not in blacklist:
    #             if w not in dic:
    #                 keyphraseFile.write(w.replace(' ', '_') + ':' + str(k)[0:6] + ' ')
    #                 dic.append(w)
    #     except:
    #         pass
    dic = []
    for k, g in itertools.groupby(di, key=itemgetter(1)):
        try:
            print(k)
            for item in g:
                print(item)
                w = str(item[0])
                w = w[0:w.index('/')]
                if len(w) > 2 and w not in blacklist:
                    # if len(w) > 2:
                    if w not in dic:
                        # assumed: mirrors the commented-out write above
                        keyphraseFile.write(w.replace(' ', '_') + ':' + str(k)[0:6] + ' ')
                        dic.append(w)
        except:
            pass
    keyphraseFile.close()