Guys i am writing this program that goes through list of tweets and returns words which was use the most.
I want to make it faster but I wonder if you can help point out some problems or areas which i can improve the speed. thanks
see code below
#import string
import re
from string import punctuation
from operator import itemgetter
import pprint
class Tweet:
def __init__(self, timestamp, userId, message):
self.timestamp = timestamp
self.userId = userId
self.message = message
def getDate(self):
tokens = re.split(' ', self.timestamp)
return tokens[0]
def __repr__(self):
return "[timestamp=%s userId=%s message=%s]" % (self.timestamp, self.userId, self.message)
outfile = file
def readOneTweet(file):
""" Reads a single tweet from the file, and returns the string containing the tweet.
This will often just be a single line from the file, but may be more if it ends with a slash.
"""
lineBuffer = ""
while True:
# TODO: read the line and strip it
rawLine = file.readline().strip('\n')
if (len(rawLine)== 0):
break
lineBuffer +=rawLine
if (rawLine[(len(rawLine)-1)]!= "\\"):
break
return lineBuffer
def readTweets():
tweets = []
inputfile = raw_input("Enter filename: ")
# move the try / except around a single tweet.
# so that we can keep going if we encounter a line with an error.
try:
f = open(inputfile , "r")
while True:
tweet = readOneTweet(f) # readOneTweet is method
if not tweet:
break
try:
lineStrip = tweet.rstrip()
split_word = re.split('\t', lineStrip.lower()) #('/([^a-z])([A-Z]) ([0-9])/n:.;\]+/', line.lower())
tweetTime = split_word[1]
userId = split_word[0]
message = split_word[2]
tweets.append(Tweet(tweetTime, userId, message))
if len(tweets) % 10000 == 0:
print 'read', len(tweets), 'tweets'
except IndexError, e:
print "bad tweet", tweet
except IOError:
print "file not found!"
return tweets
######################DATA ##############
"""
- Need to separate tweets
- Obtain information about each tweet - UserID, Time, words
"""
def writeWordFile(word):
toWrite = 'test.txt'
fileHandle = open ( toWrite, 'w' )
for i in word:
fileHandle.write (i)
def dailyMessages(twt):
dailyMsg =dict ()
for i in twt:
date =i.getDate()
#print i.message
#dailyMsg[date] =messageList
if dailyMsg.has_key(date):
dailyMsg[date].append(twt)
else:
dailyMsg[date] =[twt]
#for k, v in dailyMsg.items():
#print k, v, '\n'
return dailyMsg
"""
Takes dailyTweets and perform word coun.
"""
def dailyWord(tweetsByDay):
dailyTweetsWordCount = { }
for date in tweetsByDay.keys():
dayTweets =tweetsByDay[date]
if len(dayTweets) != 0:
count = wordCount(dayTweets)
dailyTweetsWordCount[date] = count
return dailyTweetsWordCount
def wordCount(tweets):
"""Takes a list of tweets and returns a dictionary of counts for words"""
N = 100
# we'll build a dictionary mapping each word to a SET of users who have used it
wordTweeters = {}
for tweet in tweets:
# print tweet
for i in tweet:
for word in i.message.split():
if not wordTweeters.has_key(word):
wordTweeters[word] = set()
wordTweeters[word].add(i.userId)
# we'll build up a dictionary mapping each word to the number of users who have used it.
p = dict ()
#print wordTweeters
for day in wordTweeters.keys():
usersNo = len (wordTweeters[day])
p[day] = usersNo
#print wordTweeters
return p #wordTweeters, p
def searchForMemes(tweetUs开发者_StackOverflowerCounts):
for key in tweetsUserCounts.keys():
# for pmeme in tweetUserCounts
pass
"""Takes information returned by daily word"""
def isMeme(word, day1Count, day2Count, day3Count):
#takes the daily count
# check if it is a meme
#First - check count
#check count in different days
# determine the if it qualifies as a tweet
# if not drop it do not do below checks
#Second - time stamp
#CHECK ITS TIME TRACK
#How is the count of over time
# rise and fall
#
#Third - user id
# check if is form different users
#how many of those counts are from different users
pass
def dayUserCount(z,word, d1, d2, d3):
""" assume dictionary will be input"""
# check if the word exist in the dictionary
if z.has_key(d1):
date1 =z[d1]
#print value.keys()
if date1.has_key(word):
print date1
c1 =date1[word]
else:
print "word not used in %s"%d1
c1 =0
else:
print 'date does not exist'
if z.has_key(d2):
#print value.keys()
date2 =z[d2]
if date2.has_key(word):
print date2
c2 =date2[word]
else:
print "word not used in %s"%d2
c2 =0
else:
print 'date does not exist'
if z.has_key(d3):
date3 = z[d3]
if date3.has_key(word):
print date3
c3 =date3[word]
else:
print "word not used in %s" %d3
c3 =0
else:
print 'date does not exist'
result = "Word: %s , %s count: %s, %s count: %s, %s count: %s"%(word,d1,c1,d2,c2, d3,c3)
return result
# supportive functions
def hashtag(tw):
hasht =[]
for word in tw.split():
if word.startswith('#'):
hasht.append(word)
return hasht
def httpTag(tw):
http =[]
for word in tw.split():
if word.startswith('http'):
http.append(word)
return http
def reply(tw):
reply =[]
for word in tw.split():
if word.startswith('@'):
reply.append(word)
return reply
def reTweet(tw):
rt =[]
for word in tw.split():
if word.startswith('rt') or word.startswith('RT'):
rt.append(word)
return rt
"""
Old functions
"""
def writeToFile(tweet):
#filename = test.txt
filek = open('test.txt', 'w')
print "writing on the file: "
filek.write(tweet)
# print " _____--------______" + tweet
filek.close()
# count word frequency.
def f2count():
N = 100000000000
words = {}
words_gen = (word.strip(punctuation).lower()
for line in open('c.txt')
for word in line.split())
for word in words_gen:
words[word] = words.get(word, 0) + 1
top_words = sorted(words.iteritems(),
key=lambda(word, count): (-count, word))[:N]
for word, frequency in top_words:
print "%s: %d" % (word, frequency)
if (len(rawline) == 0):
could be written as
if rawline:
You should never use len(rawline) - 1
as an index, just use rawline[-1]
.
I don't know why you use re.split()
, when you could just do linestip.lower().split('\t')
.
Don't use dailyMsg.has_key(date)
, use date in dailyMsg
.
When you iterate over tweetsByDay
, you should really be doing this:
for date, value in tweetsByDay.items():`
that way you don't have to manually bind a value to the key.
That's just a start. There are many more issues to be worked out. I think you really just need to work on mastering Python -- it's clear from reading your code that either Python is not your first language or you learned from a resource that didn't teach you how to write it well. For example, why do you put parentheses around conditionals? That's not necessary in Python (though it's an artifact from Algol-like languages such as C or Java). Why do you use dict()
instead of {}
? It's preferable to write an empty dict the second way. You may find this tutorial on idiomatic Python helpful.
wordCount()
can be run in parallel. Since each tweet does not directly depend on another, there is no reason to iterate over the list serially. Break the list of tweets into smaller lists, and then have a thread work on each sub-list. Once they have all finished creating their sub-dictionaries, you can a little work to combine them all into one dictionary.
EDIT:
An example of how to parallize summing a list. You would change the body of the thread to do whatever your task is.
from threading import Thread
numbers = range(1000)
class Sum(Thread):
def __init__(self, numList):
Thread.__init__(self)
self.numList = numList
self.total = 0
def run(self):
for num in self.numList:
self.total += num
numThreads = 7
threads = []
perThread = len(numbers)/numThreads
for i in xrange(numThreads):
start = i*perThread
t = Sum(numbers[start:len(numbers) if i == numThreads-1 else start+perThread])
t.start()
threads.append(t)
grandTotal = 0
for t in threads:
t.join()
grandTotal += t.total
print grandTotal
This code is full of non optimized snippets.
For exemple, each call of a function takes time. Don't do some useless calls to function, and you will spare time. Some calls can be replaced with comprehension list: hashtag,httpTag, ...etc
I could help to optimize, but:
1 - I have not enough time presently for a long work of this kind
2 - we can't optimize because the code isn't complete: where are the following functions called ? :
readTweets
writeWordFile
dailyMessages
dailyWord
wordCount
searchForMemes
isMeme
dayUserCount
hashtag
httpTag
reply
reTweet
writeToFile
f2count
3 - I'm tired to answer to new registered folk that turn up at stackoverflow with heavy questions and who disappear after that, sometimes without letting any piece of news or comment. Excuse me if you intended to not do so
EDIT
writeToFile
f2count
must evidently be pulled out of the list.
精彩评论