File: class.phpsuperezsearch.php

Recommend this page to a friend!

Classes of JImmy Bo

PHP SuperEzSearch

class.phpsuperezsearch.php

File:	`class.phpsuperezsearch.php`
Role:	Class source
Content type:	`text/plain`
Description:	the class file for php superezsearch
Class:	PHP SuperEzSearch Search documents storing indexes in PHP structures
Author:	By JImmy Bo
Last change:	info
Date:	1 year ago
Size:	`16,191 bytes`

Contents

<?php
    /*
        title:      PHP_SuperEzSearch - A simple sequence search engine
        class:      PHP_SuperEzSearch
        version:    1.0
        license:    BSD
        author:     https://www.phpclasses.org/browse/author/144301.html
        category:   search
        
        tags: search, engine, sequence, matching, document, phrase, negative, query, exclude, save, load, binary, compressed, session, related, documents
        short description:
            This class is a simple search engine that uses sequence matching of two character sequences to match documents and documents to phrases.
            It allows for negative queries to be used to exclude documents from the search results.
            You can save/load the document database that you generate using the class as a binary, compressed or in session.
            You can search for a phrase that finds matches in the documents.
            You can also find related documents.
        long description:
            The PHP_SuperEzSearch class is a versatile search engine that utilizes sequence matching of two character sequences to efficiently match documents and phrases. This powerful tool allows users to employ negative queries, effectively excluding specific documents from the search results. With options to save and load the document database in binary, compressed, or session formats, this search engine is highly adaptable for various applications. Whether searching for phrases within documents or identifying related documents, the PHP_SuperEzSearch class is a valuable asset for developers seeking a user-friendly search solution.
    
        Notes: 
            - Negative search appears to work well enough for typical queries.

        Public Functions:
            public function __construct()
            // function save($file_path) :: save data serialized to binary file
            // function load($file_path) :: load data from binary file
            // function save_compressed($file_path) :: save data serialized to compressed file
            // function load_compressed($file_path) :: load data from compressed file
            // function save_to_session($session_name) :: save data serialized to session
            // function load_from_session($session_name) :: load data from session
            // function returnRelatedDocuments($document_id, $numResults=5) :: return related documents (based on content match with other documents)
            // function indexDocuments($documentArray) :: index an array of documents (add multiple documents at once)
            // function addDocument($id, $content) :: add a document to the index
            // function removeDocument($id) :: remove a document from the index
            // function getDocument($id) :: give an id, get a document (or not)
            // function search($query, $numResults = 5, $negativeKeywords=""):: search for query in documents. Takes a negative query as well.


        Private Functions:
            // todo: add private functions here 
    */

    /**
     * PHP_SuperEzSearch - a small in-memory search engine.
     *
     * Every document (and every query) is lower-cased and broken into overlapping
     * two-character sequences. The occurrence counts of those sequences form a
     * sparse vector, and documents are ranked against a query by the cosine
     * similarity of the two vectors. The index can be persisted to a plain file,
     * a zlib-compressed file, or the PHP session.
     *
     * Note: tokenisation is byte-wise (strtolower/strlen), so multi-byte UTF-8
     * characters are split into their individual bytes. This is kept as-is so
     * that indexes saved by earlier versions stay comparable.
     */
    class PHP_SuperEzSearch
    {
        /** @var array Raw document contents, keyed by document id. */
        private $documents;

        /**
         * @var array Per-document vectors (two-character sequence => count), keyed by document id.
         *            Public for backward compatibility; treat it as read-only.
         */
        public $documentVectors;

        /** @var int Number of distinct documents currently held in the index. */
        private $docCount;

        /**
         * Create an empty index.
         */
        public function __construct()
        {
            $this->documents = [];
            $this->documentVectors = [];
            $this->docCount = 0;
        }

        /**
         * Save the index, serialized, to a file.
         *
         * @param string $file_path Destination path; the file is written under an exclusive lock.
         * @return bool True on success, false when the file could not be written.
         */
        public function save($file_path)
        {
            return file_put_contents($file_path, serialize($this->exportState()), LOCK_EX) !== false;
        }

        /**
         * Load an index previously written by save().
         *
         * @param string $file_path Path of the serialized index.
         * @return bool True on success. On failure the current index is left untouched.
         */
        public function load($file_path)
        {
            $serialized = file_get_contents($file_path);

            return $serialized !== false && $this->restoreState($serialized);
        }

        /**
         * Save the index, serialized and zlib-compressed, to a file.
         *
         * @param string $file_path Destination path; the file is written under an exclusive lock.
         * @return bool True on success, false when compression or writing failed.
         */
        public function save_compressed($file_path)
        {
            $compressed = gzcompress(serialize($this->exportState()));
            if ($compressed === false) {
                return false;
            }

            return file_put_contents($file_path, $compressed, LOCK_EX) !== false;
        }

        /**
         * Load an index previously written by save_compressed().
         *
         * @param string $file_path Path of the compressed index.
         * @return bool True on success. On failure the current index is left untouched.
         */
        public function load_compressed($file_path)
        {
            $compressed = file_get_contents($file_path);
            if ($compressed === false) {
                return false;
            }

            $serialized = gzuncompress($compressed);

            return $serialized !== false && $this->restoreState($serialized);
        }

        /**
         * Store the serialized index in $_SESSION under the given key.
         * The caller is responsible for having started the session.
         *
         * @param string $session_name Key inside $_SESSION.
         */
        public function save_to_session($session_name)
        {
            $_SESSION[$session_name] = serialize($this->exportState());
        }

        /**
         * Load an index previously stored with save_to_session().
         *
         * @param string $session_name Key inside $_SESSION.
         * @return bool True on success; false when the key is absent or its payload is not
         *              a valid index (the current index is then left untouched).
         */
        public function load_from_session($session_name)
        {
            if (!isset($_SESSION[$session_name])) {
                return false;
            }

            return $this->restoreState($_SESSION[$session_name]);
        }

        /**
         * Find documents whose content resembles that of an indexed document.
         *
         * The stored content of $document_id is used as the search query. Because that
         * document is itself part of the index, it normally shows up as the best match.
         *
         * @param int|string $document_id Id of the reference document.
         * @param int        $numResults  Maximum number of results.
         * @return array|false Result list as returned by search(), or false when the id is unknown.
         */
        public function returnRelatedDocuments($document_id, $numResults=5)
        {
            $contents = $this->getDocument($document_id);
            if ($contents === false) {
                return false;
            }

            return $this->search($contents, $numResults);
        }

        /**
         * Index several documents at once.
         *
         * @param array $documentArray Map of document id => content.
         */
        public function indexDocuments($documentArray)
        {
            foreach ($documentArray as $id => $content) {
                $this->addDocument($id, $content);
            }
        }

        /**
         * Add a document to the index, replacing any document already stored under the same id.
         *
         * @param int|string $id      Document id.
         * @param string     $content Document text.
         */
        public function addDocument($id, $content)
        {
            // Re-indexing an existing id replaces it and must not inflate the count.
            $isNew = !array_key_exists($id, $this->documents);

            $this->documents[$id] = $content;
            $this->documentVectors[$id] = $this->calculateVector($content);

            if ($isNew) {
                $this->docCount++;
            }
        }

        /**
         * Remove a document from the index. Unknown ids are ignored.
         *
         * @param int|string $id Document id.
         */
        public function removeDocument($id)
        {
            // Only an id that was really indexed may lower the count.
            if (array_key_exists($id, $this->documents)) {
                $this->docCount--;
            }

            unset($this->documents[$id]);
            unset($this->documentVectors[$id]);
        }

        /**
         * Fetch the stored content of a document.
         *
         * @param int|string $id Document id.
         * @return string|false The content, or false when no document is stored under $id.
         */
        public function getDocument($id)
        {
            // isset() rather than !empty(): a document whose content is "0" is a real document.
            return isset($this->documents[$id]) ? $this->documents[$id] : false;
        }

        /**
         * Rank the indexed documents against a query.
         *
         * When $negativeKeywords is given, a second search is run for it (limited to
         * $numResults hits) and the similarity of each of those hits is subtracted from
         * the corresponding document's score, pushing it down the ranking.
         *
         * @param string $query            Text to look for.
         * @param int    $numResults       Maximum number of results.
         * @param string $negativeKeywords Text whose matches should be penalised.
         * @return array List, best match first, of
         *               ['id' => ..., 'content' => ..., 'similarity' => float].
         *               Empty when the query yields no tokens.
         */
        public function search($query, $numResults = 5, $negativeKeywords="")
        {
            $queryVector = $this->calculateVector($query);
            if (empty($queryVector)) {
                return [];
            }

            $similarities = [];
            foreach ($this->documentVectors as $id => $vector) {
                $similarities[$id] = $this->cosineSimilarity($queryVector, $vector);
            }

            if ($negativeKeywords !== '' && $negativeKeywords !== null) {
                foreach ($this->search($negativeKeywords, $numResults) as $negativeHit) {
                    $id = $negativeHit['id'];
                    if (isset($similarities[$id])) {
                        $similarities[$id] -= $negativeHit['similarity'];
                    }
                }
            }

            // Highest score first; keys (document ids) are preserved.
            arsort($similarities);
            $topResults = array_slice($similarities, 0, $numResults, true);

            $results = [];
            foreach ($topResults as $id => $similarity) {
                $results[] = [
                    'id' => $id,
                    'content' => isset($this->documents[$id]) ? $this->documents[$id] : null,
                    'similarity' => $similarity,
                ];
            }

            return $results;
        } // end search

        /**
         * Snapshot of the index in the array layout used by every save*() method.
         *
         * @return array
         */
        private function exportState()
        {
            return [
                'documents' => $this->documents,
                'documentVectors' => $this->documentVectors,
                'docCount' => $this->docCount,
            ];
        }

        /**
         * Replace the index with a serialized snapshot, but only if the snapshot is valid.
         *
         * @param mixed $serialized Payload produced by serialize(exportState()).
         * @return bool True when the snapshot was applied, false when it was rejected.
         */
        private function restoreState($serialized)
        {
            if (!is_string($serialized) || $serialized === '') {
                return false;
            }

            // SECURITY: the payload comes from a file or the session. An index only holds
            // arrays and scalars, so object instantiation is disabled to rule out
            // object-injection through a tampered payload.
            $state = unserialize($serialized, ['allowed_classes' => false]);

            if (!is_array($state)
                || !isset($state['documents'], $state['documentVectors'])
                || !is_array($state['documents'])
                || !is_array($state['documentVectors'])
            ) {
                return false;
            }

            $this->documents = $state['documents'];
            $this->documentVectors = $state['documentVectors'];
            // Derive the count from the data so a stale or missing stored count cannot desync it.
            $this->docCount = count($this->documents);

            return true;
        }

        /**
         * Build the vector of a string: two-character sequence => number of occurrences.
         *
         * @param string $content
         * @return array
         */
        private function calculateVector($content)
        {
            return array_count_values($this->tokenize($content));
        }

        /**
         * Split a string into overlapping, lower-cased two-character sequences.
         *
         * A string of odd length is padded with one trailing space first, so a single
         * character still yields one token. An empty string yields no tokens.
         *
         * @param string $content
         * @return array List of two-character tokens.
         */
        private function tokenize($content)
        {
            $content = strtolower($content);
            $length = strlen($content);

            if ($length % 2 != 0) {
                $content .= " ";
                $length++;
            }

            $tokens = [];
            for ($i = 1; $i < $length; $i++) {
                $tokens[] = $content[$i - 1] . $content[$i];
            }

            return $tokens;
        }

        /**
         * Scale a vector to unit length. A zero vector is returned unchanged.
         *
         * @param array $vector
         * @return array
         */
        private function normalize($vector)
        {
            $length = $this->magnitude($vector);
            if ($length <= 0.0) {
                return $vector;
            }

            foreach ($vector as $key => $value) {
                $vector[$key] = $value / $length;
            }

            return $vector;
        }

        /**
         * Cosine similarity of two sparse vectors.
         *
         * @param array $vectorA
         * @param array $vectorB
         * @return float 0.0 when either vector has no magnitude (e.g. an empty document).
         */
        private function cosineSimilarity($vectorA, $vectorB)
        {
            $denominator = $this->magnitude($vectorA) * $this->magnitude($vectorB);
            if ($denominator <= 0.0) {
                return 0.0;
            }

            return $this->dotProduct($vectorA, $vectorB) / $denominator;
        }

        /**
         * Dot product of two sparse vectors.
         *
         * @param array $vectorA
         * @param array $vectorB
         * @return int|float
         */
        private function dotProduct($vectorA, $vectorB)
        {
            // Walk the shorter vector; the product is symmetric, so the result is the same.
            if (count($vectorA) > count($vectorB)) {
                $swap = $vectorA;
                $vectorA = $vectorB;
                $vectorB = $swap;
            }

            $product = 0;
            foreach ($vectorA as $token => $count) {
                if (isset($vectorB[$token])) {
                    $product += $count * $vectorB[$token];
                }
            }

            return $product;
        }

        /**
         * Euclidean length of a sparse vector.
         *
         * @param array $vector
         * @return float
         */
        private function magnitude($vector)
        {
            $sumOfSquares = 0;
            foreach ($vector as $count) {
                $sumOfSquares += $count * $count;
            }

            return sqrt($sumOfSquares);
        }

    } // end class


?>

Advertise on this site

Copyright (c) Icontem 1999-2024

For more information send a message to info at phpclasses dot org.