<?php
/*
Example usage of the NgramComparator class.
N-grams are a way of breaking a string up into chunks of n characters.
They allow us to compare strings for similarity, even if the strings are of different lengths
or have some words in common but not others.
*/
require_once('class.ngram.php');
// Demo driver: exercises each public method of NgramComparator (loaded from
// class.ngram.php) on a pair of sample sentences and prints the results.
//
// NOTE(review): the "Sample output" comments below were recorded by the original
// author and have not been re-verified against the current class.ngram.php;
// treat them as illustrative rather than exact.
$comparator = new NgramComparator();

// Sample inputs. $string1 and $string2 contain the same words in a different order;
// $text is the same sentence as $string1 and is reused by the single-string examples.
$text = "The quick brown fox jumps over the lazy dog";
$string1 = "The quick brown fox jumps over the lazy dog";
$string2 = "The lazy dog jumps over the quick brown fox";

// Example usage of get_ngrams: request n-grams of size 3 for $text.
// Sample output: Array ( [0] => The [1] => he [2] => e q [3] => qu [4] => qui [5] => uic [6] => ick [7] => ck [8] => k b [9] => br [10] => bro [11] => row [12] => ow [13] =>w f [14] => fo [15] => fox [16] => ox [17] => jum [18] => ump [19] =>mps [20] => ps [21] => ove [22] =>ver [23] =>er [24] =>the [25] => he [26] => laz [27] =>azy [28] =>zy [29] =>dog)
$ngrams = $comparator->get_ngrams($text, 3);
print_r($ngrams);

// Example usage of compare_strings_ngram_pct: compare the two sentences using n-grams of size 3.
// Sample output: Percentage match: 95.121951219512%
$percentage_match = $comparator->compare_strings_ngram_pct($string1, $string2, 3);
echo "Percentage match: {$percentage_match}%\n";

// Example usage of compare_strings_ngram_max_size on the same pair of sentences.
// Sample output: Max matching n-gram size: 18
$max_matching_ngram_size = $comparator->compare_strings_ngram_max_size($string1, $string2);
echo "Max matching n-gram size: {$max_matching_ngram_size}\n";

// Example usage of get_shingles with a shingle size of 2.
// Sample output: Array ( [0] => The quick [1] => quick brown [2] => brown fox [3] => fox jumps [4] => jumps over [5] => over the [6] => the lazy [7] => lazy dog )
$shingle_size = 2;
$shingles = $comparator->get_shingles($text, $shingle_size);
print_r($shingles);

// Example usage of train_ngram_model: the input is a list of pre-tokenized sentences.
$tokenized_text = [
    ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
    ['The', 'lazy', 'dog', 'jumps', 'over', 'the', 'quick', 'brown', 'fox'],
];
$n = 3;
// Sample output: Array ( [The quick brown] => 1 [quick brown fox] => 1 [brown fox jumps] => 1 [fox jumps over] => 1 [jumps over the] => 2 [over the lazy] => 1 [the lazy dog] => 1 [lazy dog jumps] => 1 [dog jumps over] => 1 [over the quick] => 1 [the quick brown] => 1 [brown fox] => 2 [fox jumps] => 2 [jumps over] => 2 [over the] => 2 [the lazy] => 1 [lazy dog] => 1 [dog jumps] => 1 [over the] => 2 [the quick] => 1)
// NOTE(review): the sample above lists the key "over the" twice, which a PHP array
// cannot contain, so the real print_r output will differ — confirm by running the script.
$ngram_counts = $comparator->train_ngram_model($tokenized_text, $n);
print_r($ngram_counts);

// The closing PHP tag is intentionally omitted: anything after it (the original file
// had a stray "|" there) is sent to the output verbatim.