Wednesday, July 15, 2015
Python: data structure (2)
Abstract: hunt the top K among a population
We implement heap to store top K elements using the module heapq. The first step is the process of query of frequency of all words in a text, and the second step is implement heap to hunt the top K words.
The result :
[(98, 'the'), (43, 'to'), (36, 'in'), (36, 'and'), (30, 'of'), (25, 'economist'), (22, 'is'), (22, 'a'), (21, 'for'), (17, 'she'), (16, 'more')]
The script:
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 13 17:42:38 2015
@author: yuan
"""
#hunt top 10 words in a file
import re
import string
import heapq
class top_K:
def __init__(self, number):
self.topK=number
def read_file(self,file):
words_dict={}
for line in open(file):
line=line.rstrip() # remove return
line=line.lower() #all lower case
line=re.sub("\'s|s\'", "", line)
line=line.translate(None, "\\\/,.?\"<>:\'\"\“()") #remove character
parts=line.split(" ")
for part in parts:
if re.search("[A-Z]|[a-z]", part) and not re.search('([0-9])', part):
if part in words_dict.keys():
words_dict[part] +=1
else:
words_dict[part]=1
return words_dict
def print_dict(self, dictionary):
for key in dictionary.keys():
print key, '=', dictionary[key]
def heap_loop(self, dictionary):
#construc a heap
heap=[]
for key,value in dictionary.iteritems():
#print key, value
if len(heap)<=self.topK:
heapq.heappush(heap, (value, key))
else:
first_node=heap[0]
first_frq=first_node[0]
if value>first_frq:
print value, first_frq
heapq.heappushpop(heap, (value, key))
heap.sort(reverse=True)
return heap
#main program
K=top_K(10)
#read text file and query frequency per word
words_dict=K.read_file('/home/yuan/mysql_pre/cookbooks/paper.txt')
#K.print_dict(words_dict)
#get top 10 words
top_words=K.heap_loop(words_dict)
print top_words
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment