File: dictionary.py

Recommend this page to a friend!

dictionary.py

File:	`dictionary.py`
Role:	Auxiliary data
Content type:	`text/plain`
Description:	Auxiliary data
Class:	Non-Word PHP Spell Checker: detect incorrectly spelled words and suggest fixes
Author:	By Ravindu Taveesha
Last change:
Date:	5 years ago
Size:	`3,343 bytes`

Download

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import json
import PyPDF2
import textract
from urllib.request import urlopen

# Top-level container; each section of the script stores one table in it
# ('unigram', 'length', 'bigram') before it is written out as JSON.
dictionary = {}

# Project Gutenberg plain-text books that make up the training corpus.
urls = [
    'http://www.gutenberg.org/files/56006/56006-0.txt',
    'http://www.gutenberg.org/cache/epub/2776/pg2776.txt',
    'http://www.gutenberg.org/cache/epub/17090/pg17090.txt',
    'http://www.gutenberg.org/files/3400/3400-0.txt',
    'http://www.gutenberg.org/cache/epub/23531/pg23531.txt',
    'http://www.gutenberg.org/files/38046/38046-0.txt',
    'http://www.gutenberg.org/cache/epub/27250/pg27250.txt',
    'http://www.gutenberg.org/cache/epub/41189/pg41189.txt',
    'http://www.gutenberg.org/cache/epub/49739/pg49739.txt',
    'http://www.gutenberg.org/cache/epub/1319/pg1319.txt',
    'http://www.gutenberg.org/files/1289/1289-0.txt',
    'http://www.gutenberg.org/files/98/98-0.txt',
    'http://www.gutenberg.org/cache/epub/2542/pg2542.txt',
    'http://www.gutenberg.org/cache/epub/345/pg345.txt',
]

# Download every book and concatenate them into one corpus string.
# Each response is closed as soon as it has been read, and the pieces are
# joined once at the end instead of growing `text` with repeated `+=`.
pages = []
for url in urls:
    with urlopen(url) as response:
        pages.append(response.read().decode('utf8'))
text = "".join(pages)

# NOTE: to build from a local corpus instead of the network, read the file
# (e.g. 'gutenberg.txt') into `text` here.

print('==============> text count %s ' % len(text))

# Small sample corpus for quick manual runs (uncomment to bypass the download):
#text = "Hello there, how are you doing today? The weather is great and THE Python is awesome. The sky is pinkish, do not eat bread. How old are you. you done it your way"
tokens = word_tokenize(text)
# Keep only purely alphabetic tokens (this drops punctuation and numbers)
# and lowercase them so counts are case-insensitive.
words = [word.lower() for word in tokens if word.isalpha()]
# Running total of processed words; reported in the final summary.
count = 0

# Unigram table: word -> number of occurrences in the corpus.
print('==============> unigram start')
unigramDict = {}
for word in words:
    print('==============> add word')
    if word not in unigramDict:
        unigramDict[word] = 1
    else:
        print('==============> update word ' + word + ' frequency')
        unigramDict[word] += 1
    count += 1


dictionary['unigram'] = unigramDict
# The original message said "bigram" here, although this reports the size of
# the unigram table that was just built.
print('==============> %s word added to unigram ' % len(unigramDict))

# Length index: word length -> list of distinct words of that length,
# in order of first appearance in the corpus.
print('==============> length start')
lengthDict = {}
# Mirror of lengthDict holding a set per length. It makes the duplicate
# check O(1) instead of a linear scan of the list for every token, while
# the lists in lengthDict keep their insertion order for the JSON output.
seenByLength = {}
for word in words:
    size = len(word)
    if size not in lengthDict:
        print('==============> add length %s ' % size)
        lengthDict[size] = [word]
        seenByLength[size] = {word}
    elif word not in seenByLength[size]:
        print('==============> add word => length %s ' % word)
        lengthDict[size].append(word)
        seenByLength[size].add(word)

dictionary['length'] = lengthDict

# Bigram table: first word -> {following word -> number of times the pair
# occurs consecutively in `words`}.
print('==============> start bigram dictionary')
bigrams = list(nltk.bigrams(words))
bigramDict = {}

for leading, following in bigrams:
    print('==============> add word')
    followers = bigramDict.get(leading)
    if followers is None:
        # First time this leading word is seen: start its follower table.
        bigramDict[leading] = {following: 1}
        continue
    if following in followers:
        print('==============> update phrase ' + following + ' frequency')
        followers[following] += 1
    else:
        print('==============> add phrase ' + following + ' ==> ' + leading)
        followers[following] = 1

dictionary['bigram'] = bigramDict

# Write the collected tables to dictionary.json as a single JSON document.
with open('dictionary.json', 'w') as outputFile:
    json.dump(dictionary, outputFile)

# Final summary: completion marker, total words processed, distinct words.
for message in (
    '==============> dictionary created',
    '==============> total words %s ' % count,
    '==============> %s word added' % len(unigramDict),
):
    print(message)

About us

Advertise on this site

For more information send a message to info at phpclasses dot org.

File: dictionary.py

Contents