<?php
/****************************************************************
* [Forker]
*
* Example: MapReduce example that counts the occurrences of each
*          word in a set of documents
* Usage : php examples/MapReduce/demo.map-reduce.php > test-mp
* Storage: FileStorage
****************************************************************/
require 'vendor/autoload.php';
use Forker\Forker;
use Forker\Storage\FileStorage;
// Input documents, resolved relative to this script's directory.
// Each one is handed to the map step as a separate task.
$myTasks = array(
    'quijote-1.txt',
    'quijote-2.txt',
    'quijote-3.txt',
);
$numberOfSubTasks = 3;
$forker = new Forker(new FileStorage, $myTasks, $numberOfSubTasks);
$path = __DIR__;

// MAP: emit (word, 1) for every word found in each document.
$forker->fork(function ($key, $fileName, $emit) use ($path) {
    $file_to_get = "$path/$fileName";

    // stdout carries the final report (see the usage line in the header),
    // so problems with an input file are reported on stderr and the
    // document is skipped instead of letting a PHP warning leak into it.
    if (!is_readable($file_to_get)) {
        fwrite(STDERR, "Cannot read input file: $file_to_get\n");
        return;
    }

    $content = file_get_contents($file_to_get);
    if ($content === false) {
        fwrite(STDERR, "Failed to load input file: $file_to_get\n");
        return;
    }

    foreach (getUTF8Words($content) as $word) {
        $emit($word, 1);
    }
});

// REDUCE
$mapped = $forker->fetch();

// The number of sub tasks is not set here,
// since the total number of distinct words is not known in advance.
$forker = new Forker(new FileStorage('/tmp/reduced-words'), $mapped);
$forker->fork(function ($word, $counts, $emit) {
    // NOTE(review): presumably $counts is the list of values emitted for
    // $word, or a bare scalar when it was emitted only once - confirm
    // against Forker's fetch() contract.
    $emit($word, is_array($counts) ? count($counts) : 1);
});

$allWords = $forker->fetch();
arsort($allWords, SORT_NUMERIC);

// Print the 10 most used words, most frequent first.
foreach (array_slice($allWords, 0, 10, true) as $word => $counts) {
    echo $word . " (". $counts .")\n";
}
//////////////////////////////////////////////////////////
/**
 * Extracts the words of a UTF-8 encoded text.
 *
 * A word is a run of three or more consecutive letters, where a letter is
 * an ASCII letter (a-z, A-Z) or one of the accented letters matched as raw
 * UTF-8 byte pairs: U+00C0-U+00FF except the multiplication and division
 * signs, plus the Latin Extended-A letters OE/oe, S/s with caron,
 * Y with diaeresis and Z/z with caron. Matching is case-sensitive and
 * byte-oriented (the pattern does not use the "u" modifier).
 *
 * @param string $text Text to scan.
 * @return array Matched words in order of appearance; empty when none match.
 */
function getUTF8Words($text)
{
    // Pattern taken from:
    // http://stackoverflow.com/questions/10684183/extract-words-from-string-with-preg-match-all
    $wordPattern = '/([a-zA-Z]|\xC3[\x80-\x96\x98-\xB6\xB8-\xBF]|\xC5[\x92\x93\xA0\xA1\xB8\xBD\xBE]){3,}/';

    $found = array();
    $total = preg_match_all($wordPattern, $text, $found);

    // $total is 0 when nothing matched and false on a PCRE error;
    // both cases yield an empty list.
    return $total ? $found[0] : array();
}
|