import urllib2
import time


# load the words from a file, one word per line
def load_file(file_name):
    file = open(file_name, 'r')
    list_tv = []
    while 1:
        line = file.readline()
        if not line:
            break
        word = line.strip()
        if len(word) > 0:
            list_tv.append(word.lower())
    return list_tv


# save the words of a list, one word per line
def save_file(list_words, file_name):
    file = open(file_name, 'w')
    # uniques = set(list_words)
    # for w in uniques:
    for w in list_words:
        file.write(w + '\n')


# append the internal language found in the file
def append_to_file_ITL(internallang):
    with open("internallang.txt", "a") as myfile:
        for itl in internallang:
            myfile.write(itl + "\n")


# save the HTML in a file
def save_content_file(content):
    file = open("html.txt", 'w')
    file.write(content)


# read the HTML content of a URL
def read_url(word):
    return urllib2.urlopen("http://thesaurus.com/browse/" + word).read()


# strip HTML tags and return only the text data
from HTMLParser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data().strip()


# a valid word contains only lowercase letters (and dots)
import re
def is_valid(word, search=re.compile(r'[^a-z.]').search):
    return not bool(search(word))


# extract the language data from the HTML
def parseHTML(content):
    intLang = []
    synos = []
    ini = 20000  # 34000
    while 1:
        pos1 = content.find("Main Entry:", ini)
        if pos1 == -1:
            break
        pos2 = content.find("Part of Speech:", pos1)
        if pos2 == -1:
            break
        word = strip_tags(content[pos1 + len("Main Entry:"):pos2]).split('\n')[0]
        if len(word.split(' ')) > 1:
            ini = pos2
            continue
        pos3 = content.find("Definition:", pos2)
        if pos3 == -1:
            break
        ini = pos3
        tipe = strip_tags(content[pos2 + len("Part of Speech:"):pos3]).split('\n')[0]
        if tipe == 'verb' or tipe == 'adjective' or tipe == 'adverb':
            codtype = "J"
            if tipe == 'verb':
                codtype = "V"
            if tipe == 'adverb':
                codtype = "B"
            intLang.append("WORD " + codtype + " " + word + " " + word)
            continue
        if tipe != 'noun':
            continue
        pos4 = content.find("Synonyms:", pos3)
        if pos4 == -1:
            break
        defi = strip_tags(content[pos3 + len("Definition:"):pos4])
        words = word.split("/")
        for wd in words:
            intLang.append(wd + " is a " + defi)
        pos5 = content.find("", pos4)
        if pos5 == -1:
            break
        pos6 = content.find("Antonyms:", pos4)
        if pos6 > 0 and pos6
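

# --- Usage sketch (hypothetical, not part of the original script) -------------
# A minimal driver showing how the helpers above could be wired together.
# The input file name "words.txt", the 2-second delay, and the shape of
# parseHTML's return value are assumptions, since the original main loop is
# truncated above.
if __name__ == '__main__':
    words = load_file("words.txt")      # hypothetical word list, one word per line
    for w in words:
        if not is_valid(w):             # skip anything that is not a plain lowercase word
            continue
        html = read_url(w)              # fetch the thesaurus.com page for the word
        save_content_file(html)         # keep the last fetched page on disk
        intLang = parseHTML(html)       # assumes parseHTML returns the intLang entries
        append_to_file_ITL(intLang)     # append the entries to internallang.txt
        time.sleep(2)                   # small pause between requests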