I have a Python script that takes in '.html' files, removes stop words, and returns all other words in a Python dictionary. But if the same word occurs in multiple files, I want it to be returned only once, i.e. the result should contain the non-stop words, each only once.
def run():
    """Print the non-stop words of every '.html' file found in ``path``.

    The stop-word list is read from ``stopwordfile`` (whitespace-separated,
    compared in lower case). For each file in ``path`` whose name ends in
    '.html', the text inside ``<div class="body">`` is extracted, tags whose
    name starts with ``a`` or ``p``, double quotes and non-word characters
    are replaced by spaces, stop words are dropped, and the remaining words
    are printed as one list per file (order and repeats preserved).

    ``path`` and ``stopwordfile`` are read from the enclosing module scope.
    Files that contain no ``<div class="body">`` element are skipped.
    """
    regex = re.compile(r'.*<div class="body">(.*?)</div>.*', re.DOTALL | re.IGNORECASE)
    reg1 = re.compile(r'<\/?[ap][^>]*>', re.DOTALL | re.IGNORECASE)
    quotereg = re.compile(r'"', re.DOTALL | re.IGNORECASE)
    puncreg = re.compile(r'[^\w]', re.DOTALL | re.IGNORECASE)

    # A set gives O(1) membership tests when filtering each file's words.
    with open(stopwordfile, 'r') as f:
        stopwords = set(f.read().lower().split())

    htmlfiles = [name for name in os.listdir(path) if name.endswith('.html')]

    for name in htmlfiles:
        # os.path.join works whether or not ``path`` ends with a separator.
        with open(os.path.join(path, name), 'r') as f:
            text = f.read().lower()

        bodies = regex.findall(text)
        if not bodies:
            # No body div in this file: nothing to extract.
            continue

        text = quotereg.sub(' ', bodies[0])
        text = reg1.sub(' ', text)
        text = puncreg.sub(' ', text)

        # Per-file list with stop words removed; words repeated within or
        # across files are not merged here.
        words = [w for w in text.strip().split() if w not in stopwords]

        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(words)


if __name__ == '__main__':
    run()
Use a set. Simply add every word you find to the set; it ignores duplicates.
Assuming you have an iterator that returns each word in a file (this is for plain text; HTML would be rather more complicated):
def words(filename):
    """Yield each whitespace-separated word of the text file ``filename``.

    Words are produced lazily, line by line, in file order; repeated words
    are yielded every time they occur. The file is closed when the generator
    is exhausted or discarded.
    """
    with open(filename) as wordfile:
        for line in wordfile:
            # split() with no argument splits on any run of whitespace and
            # never yields empty strings, so blank lines contribute nothing.
            for word in line.split():
                yield word
Then getting them into a set is simple:
# Build a set from the words yielded for one file; a set keeps each distinct word once.
wordlist = set(words("words.txt"))
If you have multiple files, just do it like so:
# Accumulate the distinct words of several files into one set.
wordlist = set()
wordfiles = ["words1.txt", "words2.txt", "words3.txt"]
for wordfile in wordfiles:
    # update() consumes the iterable directly, so no temporary set is built
    # per file; words already present are ignored.
    wordlist.update(words(wordfile))
You can also use a set for your stop words. Then you can simply subtract them from the word list after the fact, which will probably be faster than checking to see if each word is a stop word before adding.
# Stop words kept as a set so they can be removed from the word set in one step.
stopwords = {"a", "an", "the"}
# In-place set difference: drops every stop word present in wordlist.
wordlist.difference_update(stopwords)
精彩评论