PHP Classes

File: dictionary.py

Recommend this page to a friend!
  Classes of Ravindu Taveesha   Non-Word PHP Spell Checker   dictionary.py   Download  
File: dictionary.py
Role: Auxiliary data
Content type: text/plain
Description: Auxiliary data
Class: Non-Word PHP Spell Checker
Detect incorrectly spelled words and suggest fixes
Author: By
Last change:
Date: 4 years ago
Size: 3,343 bytes
 

Contents

Class file image Download
"""Build the spell checker's lookup tables from a Project Gutenberg corpus.

The script downloads the texts listed in ``urls``, tokenizes them with NLTK,
and writes ``dictionary.json`` containing three tables:

* ``unigram`` -- word -> number of occurrences in the corpus
* ``length``  -- word length -> list of distinct words of that length
* ``bigram``  -- first word -> {following word -> number of occurrences}

Only purely alphabetic tokens are kept, and all words are lowercased.
"""
import json
from collections import Counter
from urllib.request import urlopen

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import PyPDF2
import textract

dictionary = {}

urls = [
    'http://www.gutenberg.org/files/56006/56006-0.txt',
    'http://www.gutenberg.org/cache/epub/2776/pg2776.txt',
    'http://www.gutenberg.org/cache/epub/17090/pg17090.txt',
    'http://www.gutenberg.org/files/3400/3400-0.txt',
    'http://www.gutenberg.org/cache/epub/23531/pg23531.txt',
    'http://www.gutenberg.org/files/38046/38046-0.txt',
    'http://www.gutenberg.org/cache/epub/27250/pg27250.txt',
    'http://www.gutenberg.org/cache/epub/41189/pg41189.txt',
    'http://www.gutenberg.org/cache/epub/49739/pg49739.txt',
    'http://www.gutenberg.org/cache/epub/1319/pg1319.txt',
    'http://www.gutenberg.org/files/1289/1289-0.txt',
    'http://www.gutenberg.org/files/98/98-0.txt',
    'http://www.gutenberg.org/cache/epub/2542/pg2542.txt',
    'http://www.gutenberg.org/cache/epub/345/pg345.txt',
]

# Download every document, closing each HTTP response as soon as it is read.
documents = []
for url in urls:
    with urlopen(url) as response:
        documents.append(response.read().decode('utf8'))

# Join with a newline so the last token of one document can never fuse with
# the first token of the next one.
text = '\n'.join(documents)
print('==============> text count %s ' % len(text))

tokens = word_tokenize(text)

# Remove punctuation/numbers and lowercase all words.
words = [word.lower() for word in tokens if word.isalpha()]
count = len(words)

# Unigram table: word -> frequency. Counter keeps first-occurrence order, so
# the resulting plain dict has the same key order as a manual counting loop.
print('==============> unigram start')
unigramDict = dict(Counter(words))
dictionary['unigram'] = unigramDict
print('==============> %s word added to unigram ' % len(unigramDict))

# Length table: word length -> distinct words of that length.
# The unigram keys are already unique and in first-occurrence order, so
# iterating them avoids a linear "is this word already listed?" scan per token.
print('==============> length start')
lengthDict = {}
for word in unigramDict:
    lengthDict.setdefault(len(word), []).append(word)
# NOTE: json.dump writes the integer length keys as strings ("3", "4", ...).
dictionary['length'] = lengthDict

# Bigram table: first word -> {second word -> frequency}.
print('==============> start bigram dictionary')
bigramDict = {}
for first, second in nltk.bigrams(words):
    followers = bigramDict.setdefault(first, {})
    followers[second] = followers.get(second, 0) + 1
dictionary['bigram'] = bigramDict

# Persist all three tables in a single JSON file.
with open('dictionary.json', 'w', encoding='utf-8') as f:
    json.dump(dictionary, f)

print('==============> dictionary created')
print('==============> total words %s ' % count)
print('==============> %s word added' % len(unigramDict))