<?php
/*
wText - This class analyzes a text and reports:
Out: - Spammyfactor in percent
- TextWeight in percent
In: - Text to analyze in constructor
- Stopwords in constructor
- Spamwords with positive(spam) or negative(interesting content) values in constructor
- optional config array in detect method
Required: List of stopwords and list of spamwords or phrases in the desired language
Version - 1.0.0
Author - Till Wehowski
License - Do What The Fuck You Want To Public License
I would be happy about a backlink to my homepage (Webfan.de), but it's not an obligation.
More - http://phpclasses.org
http://www.webfan.de (Author Homepage)
Docu - Please read the code
count: Number of words of original text
count_clean: Number of words without stopwords
count_spammy: Number of spamword occurrences found in the text
value: count_clean in percent
spammy: Share of the stopword-free words that were removed as spamwords, in percent
spammyPoints: spam points regarding weighted spamwords
spammyValue: spammyPoints / count * factor
Example:
$txt = '';
foreach($_POST as $k => $p)
{
$p = trim($p);
$p = (string)$p;
$txt .= $p;
$txt .= ' ';
}
$s = new wText($txt, $stopwords, $spamwords, TRUE );
$r = $s->calc();
$isSpam = $s->detect();
$html = '';
//var_dump($txt, $s->result);
$html.='Testergebnis:<br />';
$html.='Spam? ';
if($isSpam === FALSE)
{
$html.='<span style="color:green;">NEIN</span>';
}else{
$html.='<span style="color:red;">JA</span>';
}
$html.='<ul>';
foreach($r as $name => $value)
{
$html.='<li>';
$html.='<b>'.$name.'</b>: '.$value;
$html.='</li>';
}
$html.='</ul>';
$r = $s->getBuf();
$html.='Datengrundlage:';
$html.='<ul>';
foreach($r as $name => $value)
{
$html.='<li>';
$html.='<b>'.strip_tags($name).'</b>: '.strip_tags($value);
$html.='</li>';
}
$html.='</ul>';
echo $html;
*/
/**
 * wText - scores a text for content weight and spamminess.
 *
 * The text is stripped of markup and punctuation, then stopwords and
 * spamwords are removed step by step. The word counts of each stage and the
 * weighted spamword hits are exposed through calc() / $result, and detect()
 * compares them against configurable thresholds.
 *
 * Result keys:
 *  - count:        number of words in the normalised text
 *  - count_clean:  number of words left after removing stopwords
 *  - count_spammy: number of spamword occurrences found
 *  - value:        count_clean relative to count, in percent
 *  - spammy:       percentage of the stopword-free words removed as spamwords
 *  - spammyPoints: sum of (occurrences * value) over all spamwords
 *  - spammyValue:  spammyPoints / count * factor
 */
class wText
{
    /** Multiplier applied to spammyPoints / count to obtain spammyValue. */
    public $factor;

    /** Result array of the last calculation; empty until calc() or detect() ran. */
    public $result;

    /** Markup-free input text, padded with one blank on each side. */
    private $txt;

    /** Intermediate text stages: 'content', 'clean', 'clean_spammy'. */
    private $buf;

    /*
     * These three are per-instance data. They used to be declared static
     * while being accessed through $this, which raised notices and left them
     * undefined whenever an empty list was passed to the constructor.
     */
    private $stopwords = array();
    private $spamwords = array();
    private $noise = array();

    /** Anything but FALSE lowercases the text before analysis. */
    private $lower;

    /**
     * @param string $text      Text to analyse; may contain HTML.
     * @param array  $stopwords List of array('word' => 'theWord', 'lang' => 'de').
     * @param array  $spamwords List of array('word' => 'theWord', 'lang' => 'en', 'value' => 0.00);
     *                          positive values mark spam, negative values mark interesting content.
     * @param bool   $lowercase Lowercase the text before analysis unless FALSE.
     * @param number $factor    Multiplier used for spammyValue.
     */
    public function __construct($text, $stopwords = array(), $spamwords = array(), $lowercase = TRUE, $factor = 100 )
    {
        $this->factor = $factor;
        if (is_array($stopwords) && count($stopwords) > 0) {
            $this->stopwords = $stopwords;
        }
        if (is_array($spamwords) && count($spamwords) > 0) {
            $this->spamwords = $spamwords;
        }
        $this->lower  = $lowercase;
        $this->txt    = ' ' . $this->html2txt($text) . ' ';
        $this->buf    = array();
        $this->result = array();
        // Characters that are turned into blanks before words are counted.
        $this->noise  = array(
            '.', ':', ';', ',', '!', '?', '-', '_', '+', '=', '~', '`', '*', '&', '^', '%',
            '(', ')', '{', '}', "'", '"', '\\', '/', '|', '[', ']', '#', '$', '€', '@',
        );
    }

    /**
     * Runs the analysis and returns the result array (see class comment for the keys).
     *
     * @return array
     */
    public function calc()
    {
        $this->doCalc();
        return $this->result;
    }

    /**
     * Decides whether the analysed text counts as spam.
     *
     * Only the 'SPAM' section of $conf is evaluated. A threshold that is the
     * integer 0 disables its check; thresholds missing from $conf fall back
     * to the defaults shown in the signature. If no calculation has been run
     * yet, it is run first so the thresholds are compared to real values.
     *
     * @param array $conf Configuration with a 'SPAM' => array(threshold => number) section.
     * @return bool TRUE when any threshold is violated, FALSE otherwise.
     */
    public function detect($conf = array(
        'APP' => array('online' => 1, 'factor' => 100),
        'SPAM' => array('min_count' => 0,
            'min_count_clean' => 0,
            'max_count_spammy' => 10,
            'min_value' => 40,
            'max_spammy' => 20,
            'max_spammyPoints' => 20,
            'max_spammyValue' => 40
        ) ) )
    {
        if (!is_array($this->result) || !isset($this->result['spammyValue'])) {
            $this->doCalc();
        }

        $custom = (is_array($conf) && isset($conf['SPAM']) && is_array($conf['SPAM']))
            ? $conf['SPAM']
            : array();
        // Array union: keys given by the caller win, defaults fill the gaps.
        $limits = $custom + self::defaultLimits();

        // threshold key => result key; the result must not fall below the threshold
        $lowerBounds = array(
            'min_count'       => 'count',
            'min_count_clean' => 'count_clean',
            'min_value'       => 'value',
        );
        foreach ($lowerBounds as $limitKey => $resultKey) {
            if ($limits[$limitKey] !== 0 && $this->result[$resultKey] < $limits[$limitKey]) {
                return TRUE;
            }
        }

        // threshold key => result key; the result must not exceed the threshold
        $upperBounds = array(
            'max_count_spammy' => 'count_spammy',
            'max_spammy'       => 'spammy',
            'max_spammyPoints' => 'spammyPoints',
            'max_spammyValue'  => 'spammyValue',
        );
        foreach ($upperBounds as $limitKey => $resultKey) {
            if ($limits[$limitKey] !== 0 && $this->result[$resultKey] > $limits[$limitKey]) {
                return TRUE;
            }
        }

        return FALSE;
    }

    /** Default 'SPAM' thresholds, identical to the default argument of detect(). */
    private static function defaultLimits()
    {
        return array(
            'min_count'        => 0,
            'min_count_clean'  => 0,
            'max_count_spammy' => 10,
            'min_value'        => 40,
            'max_spammy'       => 20,
            'max_spammyPoints' => 20,
            'max_spammyValue'  => 40,
        );
    }

    /**
     * Normalises the text, strips stopwords and spamwords and fills $this->result
     * and $this->buf.
     */
    private function doCalc()
    {
        $this->result['count']        = 0;
        $this->result['count_clean']  = 0;
        $this->result['count_spammy'] = 0;
        $this->result['value']        = 0;
        $this->result['spammy']       = 0;
        $this->result['spammyPoints'] = 0;
        $this->result['spammyValue']  = 0;

        // Stage 1: plain text, optionally lowercased, punctuation turned into blanks.
        $content = strip_tags($this->txt);
        if ($this->lower !== FALSE) {
            $content = strtolower($content);
        }
        $content = str_replace($this->noise, ' ', $content);
        $this->buf['content'] = $content;
        $total = str_word_count($content);
        $this->result['count'] = $total;

        // Stage 2: remove stopwords.
        $clean = $content;
        foreach ($this->stopwords as $entry) {
            $word = self::wordOf($entry);
            if ($word !== '') {
                $clean = self::removeWord($word, $clean);
            }
        }
        $clean = self::collapseWhitespace($clean);
        $this->buf['clean'] = $clean;
        $cleanCount = str_word_count($clean);
        $this->result['count_clean'] = $cleanCount;

        // Stage 3: remove spamwords and add up their weighted occurrences.
        $withoutSpam = $clean;
        foreach ($this->spamwords as $entry) {
            $word = self::wordOf($entry);
            if ($word === '') {
                // An empty word would match every blank and distort the counts.
                continue;
            }
            $withoutSpam = self::removeWord($word, $withoutSpam);
            $hits   = self::countWord($word, $content);
            $weight = (isset($entry['value']) && is_numeric($entry['value'])) ? $entry['value'] + 0 : 0;
            $this->result['spammyPoints'] += $hits * $weight;
            $this->result['count_spammy'] += $hits;
        }
        $withoutSpam = self::collapseWhitespace($withoutSpam);
        $this->buf['clean_spammy'] = $withoutSpam;

        // Ratios. A text without words yields 0 instead of dividing by zero.
        if ($total > 0) {
            $this->result['value']       = round(($cleanCount / $total) * 100, 2);
            $this->result['spammyValue'] = round(($this->result['spammyPoints'] / $total) * $this->factor, 2);
        } else {
            $this->result['value']       = 0.0;
            $this->result['spammyValue'] = 0.0;
        }
        if ($cleanCount > 0) {
            $this->result['spammy'] = round(100 - ((str_word_count($withoutSpam) / $cleanCount) * 100), 2);
        } else {
            $this->result['spammy'] = 0.0;
        }
    }

    /** Returns the trimmed 'word' of a stopword/spamword entry, or '' if the entry has none. */
    private static function wordOf($entry)
    {
        if (!is_array($entry) || !isset($entry['word'])) {
            return '';
        }
        return trim((string)$entry['word']);
    }

    /**
     * Replaces every occurrence of $word that follows whitespace and ends at a
     * word boundary with a blank (case-insensitive). The word is quoted
     * including the '/' delimiter so that it cannot break the pattern.
     */
    private static function removeWord($word, $text)
    {
        $replaced = preg_replace('/\s' . preg_quote($word, '/') . '\b/i', ' ', $text);
        // preg_replace() returns NULL on failure; keep the text instead of losing it.
        return ($replaced === null) ? $text : $replaced;
    }

    /**
     * Counts occurrences of $word that are surrounded by whitespace
     * (case-insensitive, like removeWord()). Lookarounds leave the separating
     * blank unconsumed, so directly repeated words are all counted.
     */
    private static function countWord($word, $text)
    {
        $matches = array();
        $found = preg_match_all('/(?<=\s)' . preg_quote($word, '/') . '(?=\s)/i', $text, $matches);
        return (int)$found;
    }

    /** Squeezes every run of whitespace into a single blank. */
    private static function collapseWhitespace($text)
    {
        $squeezed = preg_replace('/\s+/', ' ', $text);
        return ($squeezed === null) ? $text : $squeezed;
    }

    /**
     * Returns the intermediate text stages of the last calculation
     * ('content', 'clean', 'clean_spammy'); empty before the first calculation.
     *
     * @return array
     */
    public function getBuf()
    {
        return $this->buf;
    }

    /**
     * Reduces an HTML document to its text.
     *
     * Script blocks, style blocks and HTML comments are removed together with
     * their content before the generic tag pattern runs; otherwise the tag
     * pattern would strip only the <style> tags and leave the CSS behind, or
     * cut a comment at the first '>' inside it.
     *
     * @param string $document HTML or plain text.
     * @return string Text without markup.
     */
    function html2txt($document){
        $document = (string)$document;
        $search = array('@<script[^>]*?>.*?</script>@si', // Strip out javascript
            '@<style[^>]*?>.*?</style>@si',               // Strip style blocks, one at a time
            '@<!--[\s\S]*?-->@',                          // Strip HTML comments including their content
            '@<[\/\!]*?[^<>]*?>@si',                      // Strip out HTML tags
            '@<![\s\S]*?--[ \t\n\r]*>@'                   // Strip multi-line comments including CDATA
        );
        $text = preg_replace($search, '', $document);
        if ($text === null) {
            // Pattern engine failure (e.g. backtrack limit): fall back to strip_tags() alone.
            $text = $document;
        }
        return strip_tags($text);
    }
}
//EOF
|