#!/usr/bin/python # Processes a file of the format: # customer identifier \t URL # and determines the most common sequences of 3-URLs hit import re # Maps customer ID to the last few URLs. customerToURLs = {} # Maps the URL-triplets to the number of times we've seen it. tripletCounts = {} # Keeps track of the best N entries. best = [] def insertSortedToBest( key, count ): """ Inserts a key/count pair into a list. Uses the count (in the second slot of the tuple) to determine which position in the list (if any) this item should go, and inserts it. Maintains the list at 10 elements.""" pos = findPositionInBest(count) if count == -1 : return old = [key, count - 1] new = [key, count] best.insert(pos, new) if best.count(old) > 0: best.remove(old) if len(best) == 11: best.remove( best[10] ) def findPositionInBest( count ): """ Determines the appropriate position for an element to be inserted into our best list.""" if len(best) == 0: return 0 i = 0 while i < len(best) and count < best[i][1]: i = i + 1 if i > 9: return -1 else: return i def processEntry( customer, url ): """ Entirely processes a customer/URL pair. """ if customerToURLs.has_key(customer): sequence = customerToURLs[customer] sequence.append(url) # If we have enough elements already, # remove the earliest one from the list if len(sequence) == 4: sequence.pop(0) # If we have enough elements, create a triplet # and increment or set it in the triplet counts if len(sequence) == 3: a, b,c = sequence key = a + ":" + b + ":" + c if tripletCounts.has_key(key): tripletCounts[key] = tripletCounts[key] + 1 insertSortedToBest(key, tripletCounts[key]) else: tripletCounts[key] = 1 insertSortedToBest(key, 1) else: customerToURLs[customer] = [url] r = re.compile(r'\t'); f = open("c:\\data", "r") for line in f: customer, url = r.split(line) processEntry(customer, url) f.close() print "====\nSolution:" print best