Login   Register  
PHP Classes
elePHPant
Icontem

File: stemmer/spanish.php

Recommend this page to a friend!
Stumble It! Stumble It! Bookmark in del.icio.us Bookmark in del.icio.us
  Classes of Cesar D. Rodas  >  Guaranix Full Text  >  stemmer/spanish.php  >  Download  
File: stemmer/spanish.php
Role: Class source
Content type: text/plain
Description: Spanish Stemmer
Class: Guaranix Full Text
Index text documents for full text searching
Author: By
Last change: One little bug was fixed by Diego Finol
Date: 2006-10-17 08:08
Size: 10,110 bytes
 

Contents

Class file image Download
<?
/*
    Creado por Cesar Rodas para el proyecto Saddor.com
    Este Stemmer está basado en el algoritmo de Snowball Stemmer.
    saddor@gmail.com
    Este programa está bajo licencia GNU
*/
if (!defined("SPANISHSTEMMER"))
{
    /* Character classes returned by SpanishStemmer::char_is(). */
    define("vocal", 1);
    define("consonante", 2);

    /* Include guard: the class is declared only the first time this file is loaded. */
    define("SPANISHSTEMMER", 1);

    /**
     * Suffix-stripping stemmer for Spanish words, based on the Snowball
     * Spanish stemmer.
     *
     * The suffix tables are lower case and compared case-sensitively, so the
     * word passed to stem() should already be lower case. Suffixes are matched
     * byte-wise; the accented literals in this file and the input must
     * therefore use the same encoding.
     *
     * Usage: $stem = (new SpanishStemmer())->stem("cantaremos"); // "cant"
     */
    class SpanishStemmer
    {
        /** Tail of the word starting at the vowel of the first consonant+vowel pair (see splitword()). */
        public $R1;

        /** Tail of the word starting at the vowel of the second consonant+vowel pair (see splitword()). */
        public $R2;

        /** Tail of the word in which most suffixes are searched (see splitword()). */
        public $RV;

        /** Word being stemmed; every step rewrites it in place. */
        public $word;

        /**
         * Reduce a word to its stem.
         *
         * @param string $word Word to stem.
         * @return string The stem. Words shorter than two bytes are returned
         *                unchanged.
         */
        public function stem($word)
        {
            $this->word = (string) $word;
            if (strlen($this->word) < 2) {
                return $this->word;
            }

            $this->step_0();
            // step_1() reports whether it changed the word; repeat until stable.
            while ($this->step_1());
            $this->step_2();
            $this->step_3();

            return $this->word;
        }

        /**
         * Step 0: remove one attached pronoun ("me", "selo", "nos", ...).
         *
         * The pronoun must end RV, and RV must contain a gerund or infinitive
         * ending (or the word must contain "uyendo"). When the ending found is
         * one of the accented forms, it is replaced by its plain form.
         *
         * @return bool True when a pronoun was removed.
         */
        public function step_0()
        {
            $this->splitword();

            // Where one pronoun is the tail of another ("sela"/"la"), the
            // longer one comes first, so the first match is the longest.
            $pronouns = array(
                "me", "se", "sela", "selo", "selas", "selos", "la", "le", "lo", "les",
                "los", "nos",
            );

            $endings = array(
                "iéndo", "ándo", "ár", "ér", "ír",   /* ids 0-4: accented forms */
                "iendo", "ando", "ar", "er", "ir",   /* ids 5-9: plain forms   */
                "yendo",                             /* id 10                  */
            );

            foreach ($endings as $id => $ending) {
                $present = strpos($this->RV, $ending) !== false
                    || ($ending == "yendo" && strpos($this->word, "uyendo") !== false);
                if (!$present) {
                    continue;
                }

                /* The ending was found; now look for the pronoun to delete. */
                foreach ($pronouns as $pronoun) {
                    if (!$this->ends_with($this->RV, $pronoun)) {
                        continue;
                    }

                    $this->word = $this->strip_tail($this->word, strlen($pronoun));
                    if ($id < 5) {
                        // Accented form: swap it for the plain form 5 slots later.
                        $this->word = str_replace($ending, $endings[$id + 5], $this->word);
                    }

                    // Only one pronoun is removed. Continuing with the old RV
                    // would cut unrelated characters off the shortened word.
                    $this->splitword();
                    return true;
                }
            }

            return false;
        }

        /**
         * Step 1: remove or rewrite one derivational suffix found at the end
         * of R2.
         *
         * @return bool True when a suffix was removed or rewritten, so that
         *              stem() calls this step again.
         */
        public function step_1()
        {
            $changed = false;
            $this->splitword();

            /* Suffixes deleted when R2 ends with them. */
            $removable = array(
                "abilidades","iblemente","icaciones","ablemente","antemente","ivamente","atamente",
                "amientos","icadoras","icadores","icancias","imientos","icamente",
                "osamente","abilidad","icidades","ividades","adamente","icantes",
                "icancia","imiemto","icadora","icación","amiento","imiento","aciones",
                "ativos","ativas","ividad","idades","icidad","icante",
                "icador","adoras","adores","ancias","mente","ables",
                "ismos","anzas","ativa","ativo","istas","ibles",
                "ación","antes","adora","ancia","ismo","anza",
                "icos","ivas","osos","ivos","ante","osas",
                "ador","ible","ista","idad","able","ico",
                "osa","oso","iva","ica","ica","ivo",
            );

            foreach ($removable as $suffix) {
                if ($this->ends_with($this->R2, $suffix)) {
                    $this->word = $this->strip_tail($this->word, strlen($suffix));
                    $changed = true;
                    break;
                }
            }

            /* Original author's note: I think this is wrong, I think R1 has to be searched. */
            if ($this->R1 == "amente") {
                $this->word = str_replace("amente", "", $this->word);
            }

            /* Suffixes rewritten (not just deleted) when R2 ends with them. */
            $rewrites = array(
                "logía"   => "log",
                "logías"  => "log",
                "ución"   => "u",
                "uciones" => "u",
                "encia"   => "ente",
                "encias"  => "ente",
            );

            foreach ($rewrites as $suffix => $replacement) {
                if ($this->ends_with($this->R2, $suffix)) {
                    $this->word = str_replace($suffix, $replacement, $this->word);
                    $changed = true;
                    break;
                }
            }

            return $changed;
        }

        /**
         * Step 2: remove a verb suffix beginning with "y" when it ends RV and
         * is preceded by "u" in the word; otherwise fall through to step_2b().
         *
         * The "u" is removed together with the suffix.
         *
         * @return void
         */
        public function step_2()
        {
            $this->splitword();

            $suffixes = array(
                "ya","ye","yan","yen","yeron","yendo","yo","yó","yas","yes","yais","yamos",
            );

            $removed = false;
            foreach ($suffixes as $suffix) {
                if ($this->ends_with($this->RV, $suffix)
                    && $this->ends_with($this->word, "u" . $suffix)
                ) {
                    $this->word = $this->strip_tail($this->word, strlen($suffix) + 1);
                    $removed = true;
                    // No entry of the list is the tail of another one, so
                    // nothing else can match.
                    break;
                }
            }

            if (!$removed) {
                $this->step_2b();
            }
        }

        /**
         * Step 2b: remove other verb suffixes found at the end of RV.
         *
         * @return void
         */
        public function step_2b()
        {
            $this->splitword();

            /*
                First group. The suffix is always deleted; when the word has
                "gu" right before it, the "u" is deleted too.

                Deleting the suffix in both cases was contributed by Diego
                Enrique Finol <dfinol at cantv dot net>. His e-mail, in short:
                the code was meant to delete the suffix and, additionally,
                the "u" when preceded by "gu", but it only deleted anything
                in the "gu" case and left the word untouched otherwise.

                Thanks Diego!
            */
            foreach (array("en","es","éis","emos") as $suffix) {
                if (!$this->ends_with($this->RV, $suffix)) {
                    continue;
                }

                $extra = $this->ends_with($this->word, "gu" . $suffix) ? 1 : 0;
                $this->word = $this->strip_tail($this->word, strlen($suffix) + $extra);

                // Refresh RV before the second group. With the old RV,
                // "cantaremos" lost "emos" here and then "aremos" below,
                // leaving an empty stem.
                $this->splitword();
            }

            /* Second group: plain deletion. */
            $suffixes = array(
                "iéramos","aríamos","iríamos","iésemos","eríamos","eríais","eremos",
                "isteis","iríais","ierais","iremos","ábamos","ieseis",
                "asteis","áramos","ásemos","aremos","aríais","abais",
                "íamos","arais","ieses","arían","iesen","ieron",
                "iendo","ieras","iréis","arías","erías","aseis",
                "eréis","erían","irían","aréis","irías","ieran",
                "ando","amos","aron","asen","aras","ados",
                "íais","ases","imos","adas","idas","abas",
                "iste","irán","erán","aría","ería","iera",
                "irás","iría","aran","arás","erás","aste",
                "iese","aban","arán","áis","ada","iré",
                "ían","irá","eré","aba","ara","ido",
                "aré","ará","ado","erá","ase","ías",
                "ida","ía","er","ar","ió","an",
                "ir","as","ad","ed","id","ís",
            );

            foreach ($suffixes as $suffix) {
                if ($this->ends_with($this->RV, $suffix)) {
                    $this->word = $this->strip_tail($this->word, strlen($suffix));
                    // The scan goes on with the regions of the shortened word.
                    $this->splitword();
                }
            }
        }

        /**
         * Step 3: remove a residual final vowel found at the end of RV, then
         * replace the remaining accented vowels and "ü" with plain letters.
         *
         * @return bool True when a final vowel was removed.
         */
        public function step_3()
        {
            $this->splitword();
            $changed = false;

            foreach (array("os","a","o","á","í","ó") as $suffix) {
                if ($this->ends_with($this->RV, $suffix)) {
                    $this->word = $this->strip_tail($this->word, strlen($suffix));
                    $changed = true;
                }
            }

            foreach (array("e","é") as $suffix) {
                if (!$this->ends_with($this->RV, $suffix)) {
                    continue;
                }

                // When RV has "gu" right before the vowel, drop the "u" too.
                $extra = $this->ends_with($this->RV, "gu" . $suffix) ? 1 : 0;
                $this->word = $this->strip_tail($this->word, strlen($suffix) + $extra);
                $changed = true;
            }

            $this->word = str_replace(
                array("á", "é", "í", "ó", "ú", "ü"),
                array("a", "e", "i", "o", "u", "u"),
                $this->word
            );

            return $changed;
        }

        /* Helper functions */

        /**
         * Comparison callback that orders strings by byte length, longest
         * first.
         *
         * @param string $a
         * @param string $b
         * @return int 0 for equal lengths, 1 when $a is shorter, -1 when $a
         *             is longer.
         */
        public function saddorsort($a, $b)
        {
            $lengthA = strlen($a);
            $lengthB = strlen($b);

            if ($lengthA == $lengthB) {
                return 0;
            }

            return ($lengthA < $lengthB) ? 1 : -1;
        }

        /**
         * Recompute $R1, $R2 and $RV from the current $word.
         *
         * R1 and R2: the word is scanned from its second character for a
         * consonant immediately followed by a vowel. R1 starts at the vowel
         * of the first such pair, R2 at the vowel of the second one.
         *
         * RV: if the second character is a consonant, RV starts after the
         * next vowel; if the first two characters are vowels, RV starts after
         * the next consonant; otherwise RV starts after the third character.
         *
         * A region that cannot be located is the empty string.
         *
         * @return void
         */
        public function splitword()
        {
            $this->R1 = "";
            $this->R2 = "";
            $this->RV = "";

            $chars = $this->split_chars($this->word);
            $total = count($chars);

            $inR1 = false;
            $inR2 = false;
            for ($i = 1; $i < $total; $i++) {
                if ($inR1) {
                    $this->R1 .= $chars[$i];
                }
                if ($inR2) {
                    $this->R2 .= $chars[$i];
                }

                if ($i + 1 >= $total) {
                    break;
                }

                if ($this->char_is($chars[$i]) == consonante
                    && $this->char_is($chars[$i + 1]) == vocal
                ) {
                    // One pair opens one region only: R1 first, then R2.
                    if (!$inR1) {
                        $inR1 = true;
                    } elseif (!$inR2) {
                        $inR2 = true;
                    }
                }
            }

            /* Locating RV */
            if ($total < 2) {
                return;
            }

            $first = $this->char_is($chars[0]);
            $second = $this->char_is($chars[1]);

            if ($second == consonante) {
                $start = 2;
                while ($start < $total && $this->char_is($chars[$start]) != vocal) {
                    $start++;
                }
                $start++;
            } elseif ($second == vocal && $first == vocal) {
                $start = 2;
                while ($start < $total && $this->char_is($chars[$start]) != consonante) {
                    $start++;
                }
                $start++;
            } elseif ($total > 2) {
                $start = 3;
            } else {
                return;
            }

            // array_slice() yields an empty list when $start is past the end.
            $this->RV = implode("", array_slice($chars, $start));
        }

        /**
         * Classify a single character.
         *
         * @param string $char One character.
         * @return int|null The constant vocal for a vowel (including accented
         *                  vowels and "ü"), consonante for a consonant
         *                  (including "ñ" and "y"), null for anything else or
         *                  for an empty string.
         */
        public function char_is($char)
        {
            $char = strtolower((string) $char);
            if ($char === "") {
                return null;
            }

            $vowel = "aeiouáéíóúü";
            $consonant = "bcdfghijklmnñopqrsvtxwyz";

            // Vowels are tested first, so the "i" and "o" that also appear in
            // the consonant list are never reported as consonants.
            if (strpos($vowel, $char) !== false) {
                return vocal;
            }
            if (strpos($consonant, $char) !== false) {
                return consonante;
            }

            return null;
        }

        /**
         * Tell whether $text ends with $suffix (byte-wise, case-sensitive).
         */
        private function ends_with($text, $suffix)
        {
            $text = (string) $text;
            $length = strlen($suffix);

            return $length <= strlen($text) && substr($text, -$length) === $suffix;
        }

        /**
         * Return $text without its last $count bytes ("" when nothing is left).
         */
        private function strip_tail($text, $count)
        {
            $keep = strlen($text) - $count;

            return $keep > 0 ? substr($text, 0, $keep) : "";
        }

        /**
         * Split $text into characters.
         *
         * The text is split as UTF-8 so that "ñ" or an accented vowel is one
         * element. When it is not valid UTF-8 (for instance a single-byte
         * encoding) it is split into bytes instead.
         */
        private function split_chars($text)
        {
            $text = (string) $text;
            if ($text === "") {
                return array();
            }

            $chars = preg_split('//u', $text, -1, PREG_SPLIT_NO_EMPTY);
            if ($chars === false) {
                $chars = str_split($text);
            }

            return $chars;
        }
    }
}
?>