<?php
namespace App;
use Domain\ImgCountReport;
use Domain\Page;
use Domain\Report;
use Domain\Site;
use Infrastructure\Repository\PageRepository;
use InvalidArgumentException;
/**
 * Class ImgCountHandler.
 * Crawls the pages of a site recursively and counts the <img> tags found on each page.
 *
 * Every page that is discovered is kept in a PageRepository; the resulting
 * ImgCountReport is built from that repository.
 *
 * @package App
 */
class ImgCountHandler
{
    /** @var Page $rootPage Page created from the root URL given to the constructor. */
    protected $rootPage;

    /** @var PageRepository $repository Storage of all pages seen during the crawl. */
    protected $repository;

    /** @var Site $site Site information; used to filter and normalise URLs. */
    protected $site;

    /** @var int $maxDepth The maximum depth of recursion when processing site pages. */
    protected $maxDepth;

    /** @var ContentLoaderInterface Loader used to fetch page content. */
    private $contentLoader;

    /**
     * ImgCountHandler constructor.
     *
     * Creates the page repository, registers the root page in it and passes the
     * headers to the content loader.
     *
     * @param Site                   $site     Site information.
     * @param string                 $rootUrl  Root URL for begin processing.
     * @param ContentLoaderInterface $loader   Content loader.
     * @param array                  $headers  CURL headers for content load.
     * @param int                    $maxDepth The maximum depth of recursion when processing site pages
     *                                         (1 = only the start URL is processed).
     */
    public function __construct(
        Site $site,
        string $rootUrl,
        ContentLoaderInterface $loader,
        array $headers = [],
        int $maxDepth = PHP_INT_MAX
    ) {
        $this->repository = new PageRepository();
        $this->rootPage = new Page($site->correctUrl($rootUrl));
        $this->repository->store($this->rootPage);

        $this->maxDepth = $maxDepth;
        $this->site = $site;
        $this->contentLoader = $loader;
        $loader->setHeaders($headers);
    }

    /**
     * Runs the crawl starting from the given URL and builds the report.
     *
     * @param string $url URL to start processing from.
     *
     * @return Report Report built from the repository of processed pages.
     */
    public function handle(string $url): Report
    {
        $this->pageProcessingRecursive([$url]);

        return new ImgCountReport($this->repository);
    }

    /**
     * Counts the opening <img> tags in a piece of markup (case-insensitive).
     *
     * "<img" counts when it is followed by whitespace, "/", ">" or the end of
     * the content, so "<img src=..>", "<img>" and "<img/>" are all counted while
     * longer names such as "<imgfoo>" or "<image>" are not.
     *
     * @param string $content Page content.
     *
     * @return int Number of tags found; 0 when the PCRE call fails.
     */
    private function countImgTags(string $content): int
    {
        $found = preg_match_all('/<img(?=[\s\/>]|$)/i', $content);

        // preg_match_all() reports failure with a non-integer value.
        return is_int($found) ? $found : 0;
    }

    /**
     * Fills a page with its child pages and its image count.
     *
     * When the URL filter yields null (treated here as a content parsing error)
     * the page gets no children and a zero image count, and an error is printed.
     *
     * @param Page   $page    Page being processed.
     * @param string $content Loaded content of the page.
     */
    private function pageProcessing(Page $page, string $content): void
    {
        $childrenUrls = $this->correctUrls(UrlFilter::getInstance()->handle($content));

        if ($childrenUrls === null) {
            $page->setChildren([])->setImgCount(0);
            $this->echoErrorMsg($page);

            // Stop here: there is no URL list to iterate, and the values set
            // above must not be overwritten below.
            return;
        }

        $children = [];
        /** @var string $url */
        foreach ($childrenUrls as $url) {
            // Reuse an already known page so every URL maps to a single Page object.
            $childPage = $this->repository->get($url) ?? new Page($url);
            $this->repository->store($childPage);
            $children[] = $childPage;
        }

        $page->setChildren($children)
            ->setImgCount($this->countImgTags($content));
    }

    /**
     * Prints a content parsing error for the page, described by the last PCRE error code.
     *
     * @param Page $page Page whose content could not be parsed.
     */
    private function echoErrorMsg(Page $page): void
    {
        $messages = [
            PREG_NO_ERROR              => 'No errors.',
            PREG_INTERNAL_ERROR        => 'Internal PCRE error.',
            PREG_BACKTRACK_LIMIT_ERROR => 'Backtrack limit was exhausted.',
            PREG_RECURSION_LIMIT_ERROR => 'Recursion limit was exhausted.',
            PREG_BAD_UTF8_ERROR        =>
                'The error was caused by malformed UTF-8 data (only when running in UTF-8 mode).',
            PREG_BAD_UTF8_OFFSET_ERROR =>
                'The offset did not correspond to the beginning of a valid UTF-8 code point '
                . '(only when running in UTF-8 mode).',
            PREG_JIT_STACKLIMIT_ERROR  => 'The last PCRE function failed due to limited JIT stack space.',
        ];

        $errorMsg = $messages[preg_last_error()] ?? 'Unknown PCRE error.';

        echo "\nContent parsing error for URL \"", $page->getUrl(), '": ', $errorMsg, "\n";
    }

    /**
     * Keeps only the URLs that belong to the site and are not processed yet, in corrected form.
     *
     * URLs for which Site::correctUrl() throws InvalidArgumentException are dropped.
     *
     * @param array|null $urlList Raw URL list, or null.
     *
     * @return array|null Re-indexed list of corrected URLs, or null when null was given.
     */
    private function correctUrls(?array $urlList): ?array
    {
        if ($urlList === null) {
            return null;
        }

        $accepted = [];
        foreach ($urlList as $url) {
            if (!$this->site->isInhere($url)) {
                continue;
            }

            try {
                $correctedUrl = $this->site->correctUrl($url);
                $knownPage = $this->repository->get($correctedUrl) ?? new Page($correctedUrl);
                if ($knownPage->isNotProcessed()) {
                    $accepted[] = $correctedUrl;
                }
            } catch (InvalidArgumentException $e) {
                // A URL that cannot be corrected is skipped.
            }
        }

        return $accepted;
    }

    /**
     * Loads and processes a batch of URLs, then descends into the children of each page.
     *
     * @param array $urlList URLs to process on this level.
     * @param int   $depth   Current depth level, starting at 1.
     */
    private function pageProcessingRecursive(array $urlList, int $depth = 1): void
    {
        $start = microtime(true);
        $urlList = $this->correctUrls($urlList) ?? [];
        if ($urlList === []) {
            // Nothing left to load on this level.
            return;
        }

        $contentArray = $this->contentLoader->loadContent($urlList);
        // The load time of the whole batch is added to every page of the batch below.
        $loadTime = microtime(true) - $start;

        foreach ($contentArray as $url => $content) {
            $start = microtime(true);
            $page = $this->repository->get($url) ?? new Page($url);
            if ($page->isNotProcessed()) {
                $this->repository->store($page);
                $this->pageProcessing($page, $content);
                $page->setProcessingTime(microtime(true) - $start + $loadTime);
            }
            // Release the content as soon as the page has been handled.
            unset($contentArray[$url]);
        }

        // Check max depth level: do not descend once the current level is the last allowed one.
        if ($depth >= $this->maxDepth) {
            return;
        }

        foreach ($urlList as $url) {
            $page = $this->repository->get($url);
            if ($page === null) {
                // The URL was never stored (e.g. no content was returned for it).
                continue;
            }

            $pendingUrls = [];
            foreach ($page->getChildren() ?? [] as $child) {
                if ($child->isNotProcessed()) {
                    $pendingUrls[] = $child->getUrl();
                }
            }

            $this->pageProcessingRecursive($pendingUrls, $depth + 1);
        }
    }
}
|