Recommend this page to a friend! |
PDF Text Extractor | > | All threads | > | Unicode support | > | (Un) Subscribe thread alerts |
|
Artur Muszynski - 2015-08-09 17:10:32
1. Function getDirtyTexts() is completely broken. It only supports the array form "[...] TJ", not the single-string form "(...) TJ". My implementation is below.
2. Whitespace bug in the regexps - you should replace \s+ with \s* for most occurrences.
3. getTextUsingTransformations() doesn't support non-hex replacement characters. My implementation is below.
4. getCharTransformations() is broken. There can be many translation tables for different fonts and this is not supported.

/**
 * Collects the operand of every TJ operator found in each text container.
 *
 * Two operand forms are recognised: an array "[ ... ] TJ" (the text between
 * the brackets is kept) and a single literal string "( ... ) TJ" (kept with
 * its parentheses, so the result can be parsed the same way as array items).
 * One string is appended to $texts per container, even when nothing matched.
 *
 * @param array $texts          Output list, passed by reference.
 * @param array $textContainers Strings to scan for TJ operators.
 * @return void
 */
function getDirtyTexts(&$texts, $textContainers) {
    foreach ($textContainers as $container) {
        // The alternation is grouped so that BOTH operand forms must be
        // followed by TJ; ungrouped, the "[...]" branch matched any array.
        // The terminator class is [\n\r]: a "|" inside a character class is
        // a literal pipe, not an alternation.
        $result = preg_match_all(
            '#(?:\[(?P<arr>.*)\]|(?P<txt>\(.*\)))\s*TJ[\n\r]#ismU',
            $container,
            $parts,
            PREG_SET_ORDER
        );
        if ($result === false) {
            continue; // PCRE error: skip this container
        }

        $t = "";
        foreach ($parts as $part) {
            // With PREG_SET_ORDER a trailing group that did not take part in
            // the match is absent from $part, hence the isset() guards.
            if (isset($part["arr"]) && strlen($part["arr"]) > 0) {
                $t .= $part["arr"];
            } elseif (isset($part["txt"]) && strlen($part["txt"]) > 0) {
                $t .= $part["txt"];
            }
        }
        $texts[] = $t;
    }
}

/**
 * Decodes the strings collected by getDirtyTexts() into readable text.
 *
 * Each input string is scanned for hex strings "<...>" and literal strings
 * "(...)"; anything outside those delimiters is ignored. Hex strings are cut
 * into chunks of $this->multibyte digits, left-padded to four digits and
 * looked up in $transformations. Literal strings have their backslash
 * escapes resolved and each remaining character looked up by its code.
 * A newline is appended to the result after every input string.
 *
 * Reads $this->multibyte and $this->convertquotes, which are defined outside
 * this snippet (presumably on the enclosing extractor class - confirm).
 *
 * @param array $texts           Strings as produced by getDirtyTexts().
 * @param array $transformations Map of 4-digit hex code => replacement hex
 *                               code. NOTE(review): the literal-string branch
 *                               assumes the same hex keys as the hex branch
 *                               and tries both lower and upper case - confirm
 *                               against getCharTransformations().
 * @return string The decoded text.
 */
function getTextUsingTransformations($texts, $transformations) {
    // Double-quoted / \x forms are required here: single-quoted '\n' is a
    // backslash followed by "n", not a line feed, and PHP has no "\b" escape.
    $escapes = array(
        "n" => "\n",
        "r" => "\r",
        "t" => "\t",
        "b" => "\x08",
        "f" => "\x0C",
    );

    $document = "";

    foreach ($texts as $t) {
        $length  = strlen($t);
        $isHex   = false;
        $isPlain = false;
        $depth   = 0;   // nesting level of unescaped "(" inside a literal string
        $hex     = "";
        $plain   = "";

        for ($j = 0; $j < $length; $j++) {
            $c = $t[$j];

            if ($isPlain) {
                if ($c === "\\") {
                    if ($j + 1 >= $length) {
                        break; // dangling backslash at end of input
                    }
                    $c2 = $t[++$j];
                    if (isset($escapes[$c2])) {
                        $plain .= $escapes[$c2];
                    } elseif (strpos("01234567", $c2) !== false) {
                        // Octal escape: one to three octal digits, stopping
                        // at the first character that is not an octal digit.
                        preg_match('#^[0-7]{1,3}#', substr($t, $j, 3), $m);
                        $j += strlen($m[0]) - 1;
                        $plain .= html_entity_decode("&#" . octdec($m[0]) . ";", $this->convertquotes);
                    } elseif ($c2 === "\r" || $c2 === "\n") {
                        // Backslash + end-of-line is a line continuation.
                        if ($c2 === "\r" && $j + 1 < $length && $t[$j + 1] === "\n") {
                            $j++;
                        }
                    } else {
                        // Covers \\, \( and \) and any unknown escape: the
                        // backslash is dropped and the character is kept.
                        $plain .= $c2;
                    }
                } elseif ($c === "(") {
                    // Balanced parentheses may appear unescaped in a string.
                    $depth++;
                    $plain .= $c;
                } elseif ($c === ")" && $depth > 0) {
                    $depth--;
                    $plain .= $c;
                } elseif ($c === ")") {
                    $document .= $plain;
                    $isPlain = false;
                } else {
                    // Includes "<" and ">", which are ordinary characters
                    // inside a literal string.
                    $key = str_pad(dechex(ord($c)), 4, "0", STR_PAD_LEFT);
                    if (!isset($transformations[$key])) {
                        $key = strtoupper($key);
                    }
                    if (isset($transformations[$key])) {
                        $c = html_entity_decode("&#x{$transformations[$key]};");
                    }
                    $plain .= $c;
                }
            } elseif ($isHex) {
                if ($c === ">") {
                    if ($hex !== "") {
                        $hexs = str_split($hex, $this->multibyte); // 2 or 4 (UTF8 or ISO)
                        foreach ($hexs as $chunk) {
                            $chex = str_pad($chunk, 4, "0", STR_PAD_LEFT); // left-pad with zeros
                            if (isset($transformations[$chex])) {
                                $chex = $transformations[$chex];
                            }
                            $document .= html_entity_decode("&#x" . $chex . ";");
                        }
                    }
                    $isHex = false;
                } elseif (ctype_xdigit($c)) {
                    // Only hex digits are collected; whitespace inside
                    // "<...>" would otherwise corrupt the chunking.
                    $hex .= $c;
                }
            } elseif ($c === "<") {
                $hex = "";
                $isHex = true;
            } elseif ($c === "(") {
                $plain = "";
                $depth = 0;
                $isPlain = true;
            }
        }

        $document .= "\n";
    }

    return $document;
}
|
info at phpclasses dot org
.