Wednesday, July 15, 2015

Python: data structure (2)


Abstract: hunt the top K among a population

We implement heap to store top K elements using the module heapq. The first step is the process of query of frequency of all words in a text, and the second step is implement heap to hunt the top K words.

The result :
[(98, 'the'), (43, 'to'), (36, 'in'), (36, 'and'), (30, 'of'), (25, 'economist'), (22, 'is'), (22, 'a'), (21, 'for'), (17, 'she'), (16, 'more')]

The script:

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 13 17:42:38 2015

@author: yuan
"""

#hunt top 10 words in a file
import re
import string
import heapq

class top_K:
    def __init__(self, number):
        self.topK=number
       
    def read_file(self,file):
        words_dict={}
        for line in open(file):
            line=line.rstrip()  # remove return
            line=line.lower()  #all lower case
            line=re.sub("\'s|s\'", "", line)
            line=line.translate(None, "\\\/,.?\"<>:\'\"\“()")  #remove character
            parts=line.split(" ")
            for part in parts:
                if re.search("[A-Z]|[a-z]", part) and not re.search('([0-9])', part):
                    if part in words_dict.keys():
                        words_dict[part] +=1
                    else:
                        words_dict[part]=1
        return words_dict
       
    def print_dict(self, dictionary):
        for key in dictionary.keys():
            print key, '=', dictionary[key]
           
    def heap_loop(self, dictionary):
        #construc a heap
        heap=[]
        for key,value in dictionary.iteritems():
            #print key, value
            if len(heap)<=self.topK:
                heapq.heappush(heap, (value, key))
            else:
                first_node=heap[0]
                first_frq=first_node[0]
                if value>first_frq:
                    print value, first_frq
                    heapq.heappushpop(heap, (value, key))
        heap.sort(reverse=True)
        return heap
               
       
#main program
K=top_K(10)

#read text file and query frequency per word
words_dict=K.read_file('/home/yuan/mysql_pre/cookbooks/paper.txt')
#K.print_dict(words_dict)

#get top 10 words
top_words=K.heap_loop(words_dict)
print top_words

No comments:

Post a Comment