import json
from urllib.request import urlopen

import nltk
from nltk.tokenize import word_tokenize

# The punkt tokenizer data must be present for word_tokenize; on a fresh
# install, run once: nltk.download('punkt')
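# Build three dictionaries from a corpus of Project Gutenberg e-books:
#   'unigram': word -> frequency
#   'length' : word length -> list of distinct words of that length
#   'bigram' : first word -> {second word -> frequency}
# All three are dumped together to dictionary.json.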
dictionary = {}
text = ""
urls = [
'http://www.gutenberg.org/files/56006/56006-0.txt',
'http://www.gutenberg.org/cache/epub/2776/pg2776.txt',
'http://www.gutenberg.org/cache/epub/17090/pg17090.txt',
'http://www.gutenberg.org/files/3400/3400-0.txt',
'http://www.gutenberg.org/cache/epub/23531/pg23531.txt',
'http://www.gutenberg.org/files/38046/38046-0.txt',
'http://www.gutenberg.org/cache/epub/27250/pg27250.txt',
'http://www.gutenberg.org/cache/epub/41189/pg41189.txt',
'http://www.gutenberg.org/cache/epub/49739/pg49739.txt',
'http://www.gutenberg.org/cache/epub/1319/pg1319.txt',
'http://www.gutenberg.org/files/1289/1289-0.txt',
'http://www.gutenberg.org/files/98/98-0.txt',
'http://www.gutenberg.org/cache/epub/2542/pg2542.txt',
'http://www.gutenberg.org/cache/epub/345/pg345.txt'
]
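# Download each plain-text e-book and concatenate everything into one
# corpus string.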
for url in urls:
    text += urlopen(url).read().decode('utf-8')
# Alternatively, read the corpus from a local file:
# with open('gutenberg.txt') as fileHandler:
#     text = fileHandler.read()
print('==============> text length: %s characters' % len(text))
#text = "Hello there, how are you doing today? The weather is great and THE Python is awesome. The sky is pinkish, do not eat bread. How old are you. you done it your way"
tokens = word_tokenize(text)
# keep alphabetic tokens only (drops punctuation and numbers) and lowercase them
words = [word.lower() for word in tokens if word.isalpha()]
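# e.g. word_tokenize("The sky is pinkish.") yields
# ['The', 'sky', 'is', 'pinkish', '.'], and the '.' token is then
# dropped by the isalpha() filter above.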
count = 0
print('==============> unigram start')
unigramDict = {}
for word in words:
    if word not in unigramDict:
        unigramDict[word] = 1
    else:
        unigramDict[word] += 1
    count += 1
dictionary['unigram'] = unigramDict
print('==============> %s unique words added to unigram dictionary' % len(unigramDict))
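# Note: the counting loop above is equivalent to
# collections.Counter(words); it is written out explicitly here so its
# structure mirrors the bigram loop below.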
# length dictionary: word length -> list of distinct words of that length
print('==============> length start')
lengthDict = {}
# loop through the words and group distinct words by length
for word in words:
    if len(word) not in lengthDict:
        lengthDict[len(word)] = [word]
    elif word not in lengthDict[len(word)]:
        lengthDict[len(word)].append(word)
dictionary['length'] = lengthDict
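# Note: json.dump() coerces the integer length keys to strings, so
# consumers of dictionary.json will see "3" rather than 3 in the
# 'length' section. Also, the list-membership test above is O(n) per
# word; a set per length would be faster, at the cost of converting
# back to lists before JSON serialization.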
# bigram dictionary: first word -> {second word -> frequency}
print('==============> start bigram dictionary')
bigrams = list(nltk.bigrams(words))
bigramDict = {}
for first, second in bigrams:
    if first not in bigramDict:
        bigramDict[first] = {}
    if second in bigramDict[first]:
        bigramDict[first][second] += 1
    else:
        bigramDict[first][second] = 1
dictionary['bigram'] = bigramDict
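# Illustration (hypothetical usage, not part of the build): the nested
# counts can rank likely successors of a word, e.g. for a word such as
# 'dark':
#   candidates = bigramDict.get('dark', {})
#   best = max(candidates, key=candidates.get) if candidates else None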
# write all three dictionaries to disk as JSON
with open('dictionary.json', 'w') as f:
    json.dump(dictionary, f)
print('==============> dictionary created')
print('==============> total words: %s' % count)
print('==============> %s unique words added' % len(unigramDict))
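# A minimal sketch of consuming the output (assumptions: dictionary.json
# was written by this script; 'old' and 'man' are example words that may
# or may not appear in the corpus). The MLE bigram probability
# P('man' | 'old') is count('old man') / count('old'):
# with open('dictionary.json') as f:
#     d = json.load(f)
# prob = d['bigram'].get('old', {}).get('man', 0) / max(d['unigram'].get('old', 1), 1)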