<?php
/*
title: PHP_SuperEzSearch - A simple sequence search engine
class: PHP_SuperEzSearch
version: 1.0
license: BSD
author: https://www.phpclasses.org/browse/author/144301.html
category: search
tags: search, engine, sequence, matching, document, phrase, negative, query, exclude, save, load, binary, compressed, session, related, documents
short description:
This class is a simple search engine that matches phrases to documents, and documents to each other, by comparing their overlapping two-character sequences.
It allows for negative queries to be used to exclude documents from the search results.
You can save and load the document database that the class builds as a serialized file, as a compressed file, or in a session variable.
You can search for a phrase that finds matches in the documents.
You can also find related documents.
long description:
The PHP_SuperEzSearch class is a versatile search engine that utilizes sequence matching of two character sequences to efficiently match documents and phrases. This powerful tool allows users to employ negative queries, effectively excluding specific documents from the search results. With options to save and load the document database in binary, compressed, or session formats, this search engine is highly adaptable for various applications. Whether searching for phrases within documents or identifying related documents, the PHP_SuperEzSearch class is a valuable asset for developers seeking a user-friendly search solution.
Notes:
- Negative search works reasonably well: the similarity of each top match for the negative query is subtracted from that document's score.
Public Functions:
public function __construct()
// function save($file_path) :: save data serialized to binary file
// function load($file_path) :: load data from binary file
// function save_compressed($file_path) :: save data serialized to compressed file
// function load_compressed($file_path) :: load data from compressed file
// function save_to_session($session_name) :: save data serialized to session
// function load_from_session($session_name) :: load data from session
// function returnRelatedDocuments($document_id, $numResults=5) :: return related documents (based on content match with other documents)
// function indexDocuments($documentArray) :: index an array of documents (add multiple documents at once)
// function addDocument($id, $content) :: add a document to the index
// function removeDocument($id) :: remove a document from the index
// function getDocument($id) :: give an id, get a document (or not)
// function search($query, $numResults = 5, $negativeKeywords=""):: search for query in documents. Takes a negative query as well.
Private Functions:
// todo: add private functions here
*/
/**
 * A small in-memory search engine.
 *
 * Every document (and every query) is turned into a vector of overlapping
 * two-byte sequence counts; documents are ranked by the cosine similarity
 * between their vector and the query vector. The index can be persisted to a
 * serialized file, a gz-compressed file, or a session variable.
 *
 * Tokenisation is byte-wise (strlen / string offsets), so multi-byte
 * characters are split into their individual bytes.
 */
class PHP_SuperEzSearch
{
    /** @var array Raw document contents, keyed by document id. */
    private $documents;

    /**
     * @var array Two-byte sequence count vectors, keyed by document id.
     *            Kept public because it was public in earlier versions.
     */
    public $documentVectors;

    /** @var int Number of documents currently held in the index. */
    private $docCount;

    /**
     * Creates an empty index.
     */
    public function __construct()
    {
        $this->documents = [];
        $this->documentVectors = [];
        $this->docCount = 0;
    }

    /**
     * Saves the index, serialized, to a file (written with an exclusive lock).
     *
     * @param string $file_path Destination file.
     * @return bool True when the file was written, false otherwise.
     */
    public function save($file_path)
    {
        return file_put_contents($file_path, $this->exportState(), LOCK_EX) !== false;
    }

    /**
     * Loads an index previously written by save().
     *
     * The current index is left untouched when the file cannot be read or
     * does not contain a valid saved index.
     *
     * @param string $file_path Source file.
     * @return bool True when the index was replaced, false otherwise.
     */
    public function load($file_path)
    {
        $serialized = file_get_contents($file_path);
        if ($serialized === false) {
            return false;
        }
        return $this->importState($serialized);
    }

    /**
     * Saves the index, serialized and gz-compressed, to a file (written with
     * an exclusive lock).
     *
     * @param string $file_path Destination file.
     * @return bool True when the file was written, false otherwise.
     */
    public function save_compressed($file_path)
    {
        $compressed = gzcompress($this->exportState());
        if ($compressed === false) {
            return false;
        }
        return file_put_contents($file_path, $compressed, LOCK_EX) !== false;
    }

    /**
     * Loads an index previously written by save_compressed().
     *
     * The current index is left untouched when the file cannot be read,
     * cannot be decompressed, or does not contain a valid saved index.
     *
     * @param string $file_path Source file.
     * @return bool True when the index was replaced, false otherwise.
     */
    public function load_compressed($file_path)
    {
        $compressed = file_get_contents($file_path);
        if ($compressed === false) {
            return false;
        }
        $serialized = gzuncompress($compressed);
        if ($serialized === false) {
            return false;
        }
        return $this->importState($serialized);
    }

    /**
     * Stores the serialized index in $_SESSION under the given key.
     *
     * @param string $session_name Key to use inside $_SESSION.
     * @return bool Always true.
     */
    public function save_to_session($session_name)
    {
        $_SESSION[$session_name] = $this->exportState();
        return true;
    }

    /**
     * Loads an index previously stored by save_to_session().
     *
     * The current index is left untouched when the key is missing or does
     * not hold a valid saved index.
     *
     * @param string $session_name Key to read from $_SESSION.
     * @return bool True when the index was replaced, false otherwise.
     */
    public function load_from_session($session_name)
    {
        if (!isset($_SESSION[$session_name])) {
            return false;
        }
        return $this->importState($_SESSION[$session_name]);
    }

    /**
     * Returns the documents most similar to an indexed document.
     *
     * The document's own content is used as the search query, so the
     * document itself is part of the candidate set and normally appears
     * among the results.
     *
     * @param mixed $document_id Id of the reference document.
     * @param int   $numResults  Maximum number of results.
     * @return array|false Result rows as returned by search(), or false when
     *                     the document is unknown or has empty content.
     */
    public function returnRelatedDocuments($document_id, $numResults = 5)
    {
        $contents = $this->getDocument($document_id);
        if ($contents === false || $contents === '') {
            return false;
        }
        return $this->search($contents, $numResults);
    }

    /**
     * Adds several documents at once.
     *
     * @param array $documentArray Map of document id => content.
     * @return void
     */
    public function indexDocuments($documentArray)
    {
        foreach ($documentArray as $id => $content) {
            $this->addDocument($id, $content);
        }
    }

    /**
     * Adds a document to the index, or replaces the document stored under
     * the same id.
     *
     * @param mixed  $id      Document id (used as an array key).
     * @param string $content Document text.
     * @return void
     */
    public function addDocument($id, $content)
    {
        // Only a genuinely new id grows the count; re-adding replaces in place.
        if (!array_key_exists($id, $this->documents)) {
            $this->docCount++;
        }
        $this->documents[$id] = $content;
        $this->documentVectors[$id] = $this->calculateVector($content);
    }

    /**
     * Removes a document from the index. Unknown ids are ignored.
     *
     * @param mixed $id Document id.
     * @return void
     */
    public function removeDocument($id)
    {
        // Only shrink the count when something is actually removed.
        if (array_key_exists($id, $this->documents)) {
            $this->docCount--;
        }
        unset($this->documents[$id]);
        unset($this->documentVectors[$id]);
    }

    /**
     * Returns the stored content of a document.
     *
     * @param mixed $id Document id.
     * @return mixed The content, or false when no document has that id.
     */
    public function getDocument($id)
    {
        // isset() rather than !empty(): content such as "0" is a valid document.
        if (isset($this->documents[$id])) {
            return $this->documents[$id];
        }
        return false;
    }

    /**
     * Ranks the indexed documents against a query.
     *
     * When negative keywords are given, they are searched for separately
     * (limited to $numResults hits) and the similarity of each of those hits
     * is subtracted from the matching document's score before ranking.
     *
     * @param string $query            Text to search for.
     * @param int    $numResults       Maximum number of results.
     * @param string $negativeKeywords Text whose matches should be penalised.
     * @return array List of ['id' => ..., 'content' => ..., 'similarity' => ...]
     *               rows, best match first; empty when the query yields no tokens.
     */
    public function search($query, $numResults = 5, $negativeKeywords = "")
    {
        $queryVector = $this->calculateVector($query);
        if (empty($queryVector)) {
            return [];
        }

        $similarities = [];
        foreach ($this->documentVectors as $id => $vector) {
            $similarities[$id] = $this->cosineSimilarity($queryVector, $vector);
        }

        if (!empty($negativeKeywords)) {
            $negativeHits = $this->search($negativeKeywords, $numResults);
            foreach ($negativeHits as $negativeHit) {
                $id = $negativeHit['id'];
                if (isset($similarities[$id])) {
                    $similarities[$id] = $similarities[$id] - $negativeHit['similarity'];
                }
            }
        }

        arsort($similarities);
        $topResults = array_slice($similarities, 0, $numResults, true);

        $results = [];
        foreach ($topResults as $id => $similarity) {
            $results[] = [
                'id' => $id,
                // $documentVectors is public, so tolerate a vector without a document.
                'content' => isset($this->documents[$id]) ? $this->documents[$id] : null,
                'similarity' => $similarity,
            ];
        }
        return $results;
    }

    /**
     * Serializes the index state for the save* methods.
     *
     * @return string Serialized array with the keys 'documents',
     *                'documentVectors' and 'docCount'.
     */
    private function exportState()
    {
        return serialize([
            'documents' => $this->documents,
            'documentVectors' => $this->documentVectors,
            'docCount' => $this->docCount,
        ]);
    }

    /**
     * Replaces the index state with previously serialized state.
     *
     * Nothing is modified unless the payload decodes to an array holding
     * 'documents' and 'documentVectors' arrays.
     *
     * @param mixed $serialized Payload produced by exportState().
     * @return bool True when the state was replaced, false otherwise.
     */
    private function importState($serialized)
    {
        if (!is_string($serialized) || $serialized === '') {
            return false;
        }

        // SECURITY: the payload comes from a file or the session and may have
        // been tampered with. Object instantiation is disabled so a crafted
        // payload cannot trigger magic methods; never load untrusted files.
        $state = unserialize($serialized, ['allowed_classes' => false]);

        if (!is_array($state)
            || !isset($state['documents'], $state['documentVectors'])
            || !is_array($state['documents'])
            || !is_array($state['documentVectors'])
        ) {
            return false;
        }

        $this->documents = $state['documents'];
        $this->documentVectors = $state['documentVectors'];
        $this->docCount = isset($state['docCount'])
            ? (int) $state['docCount']
            : count($state['documents']);
        return true;
    }

    /**
     * Builds the count vector of a string.
     *
     * @param string $content Text to vectorise.
     * @return array Map of two-byte token => number of occurrences.
     */
    private function calculateVector($content)
    {
        return array_count_values($this->tokenize($content));
    }

    /**
     * Splits a string into overlapping two-byte tokens.
     *
     * The string is lower-cased first. A string of odd length is padded with
     * one trailing space before the sliding window runs; this is kept as-is
     * so vectors stay comparable with previously saved indexes.
     *
     * @param string $content Text to tokenise.
     * @return array List of two-byte tokens; empty for an empty string.
     */
    private function tokenize($content)
    {
        $content = strtolower((string) $content);
        $length = strlen($content);
        if ($length % 2 != 0) {
            $content .= " ";
            $length++;
        }

        // Initialised up front so an empty string yields [] instead of null.
        $tokens = [];
        for ($i = 1; $i < $length; $i++) {
            $tokens[] = $content[$i - 1] . $content[$i];
        }
        return $tokens;
    }

    /**
     * Scales a vector to unit length. Currently not used by the class.
     *
     * @param array $vector Map of token => weight.
     * @return array The scaled vector; returned unchanged when its length is 0.
     */
    private function normalize($vector)
    {
        $total = $this->magnitude($vector);
        if ($total == 0) {
            return $vector;
        }
        foreach ($vector as $key => $entry) {
            $vector[$key] = $entry / $total;
        }
        return $vector;
    }

    /**
     * Cosine similarity between two count vectors.
     *
     * @param array $vectorA Map of token => count.
     * @param array $vectorB Map of token => count.
     * @return float Similarity; 0.0 when either vector has zero magnitude.
     */
    private function cosineSimilarity($vectorA, $vectorB)
    {
        $denominator = $this->magnitude($vectorA) * $this->magnitude($vectorB);
        // An empty vector (e.g. an empty document) has nothing in common with
        // anything; without this guard the division below would be by zero.
        if ($denominator == 0) {
            return 0.0;
        }
        return $this->dotProduct($vectorA, $vectorB) / $denominator;
    }

    /**
     * Dot product of two count vectors over their shared tokens.
     *
     * @param array $vectorA Map of token => count.
     * @param array $vectorB Map of token => count.
     * @return int|float
     */
    private function dotProduct($vectorA, $vectorB)
    {
        $product = 0;
        foreach ($vectorA as $word => $count) {
            if (isset($vectorB[$word])) {
                $product += $count * $vectorB[$word];
            }
        }
        return $product;
    }

    /**
     * Euclidean length of a count vector.
     *
     * @param array $vector Map of token => count.
     * @return float
     */
    private function magnitude($vector)
    {
        $sumOfSquares = 0;
        foreach ($vector as $count) {
            $sumOfSquares += $count * $count;
        }
        return sqrt($sumOfSquares);
    }
} // end class
?>
|