758 lines
14 KiB
PHP
758 lines
14 KiB
PHP
|
|
<?php
|
||
|
|
|
||
|
|
|
||
|
|
/**
 * Simple PDF to Text class.
 *
 * @license GNU General Public License version 2 or later.
 */
|
||
|
|
|
||
|
|
namespace Asika;
|
||
|
|
|
||
|
|
/**
 * Simple PDF to Text class.
 * This is free software based on the SilverStripe "class.pdf2text.php".
 *
 * @see https://code.google.com/p/lucene-silverstripe-plugin/source/browse/trunk/thirdparty/class.pdf2text.php?r=19
 */
|
||
|
|
class Pdf2text
{
    /**
     * Number of hex digits that form one character code inside <...> strings.
     * Set through setUnicode(): 4 when enabled, 2 otherwise.
     *
     * @var int
     */
    protected $multibyte = 4;

    /**
     * Quote flag handed to html_entity_decode() when numeric character references
     * are turned into text: ENT_COMPAT (double-quotes), ENT_QUOTES (both), ENT_NOQUOTES (none).
     *
     * @var int
     */
    protected $convertquotes = ENT_QUOTES;

    /**
     * When true, decodePDF() flushes the output buffers after every object.
     * Set it to true if you have problems with time-outs.
     *
     * @var bool
     */
    protected $showprogress = false;

    /**
     * Path of the PDF file read by decodePDF().
     *
     * @var string
     */
    protected $filename = '';

    /**
     * Text produced by the last decodePDF() call.
     *
     * @var string
     */
    protected $decodedtext = '';

    /**
     * Set file name and discard any previously decoded text.
     *
     * @param string $filename
     *
     * @return void
     */
    public function setFilename($filename)
    {
        $this->decodedtext = '';
        $this->filename = $filename;
    }

    /**
     * Get output text.
     *
     * @param boolean $echo True to echo the text instead of returning it.
     *
     * @return string|null The decoded text, or null when it was echoed.
     */
    public function output($echo = false)
    {
        if ($echo) {
            echo $this->decodedtext;

            return null;
        }

        return $this->decodedtext;
    }

    /**
     * Choose how many hex digits make up one character code.
     *
     * @param boolean $input True for 4 digits per code, false for 2.
     *
     * @return void
     */
    public function setUnicode($input)
    {
        $this->multibyte = ($input == true) ? 4 : 2;
    }

    /**
     * Decode the PDF named by setFilename() and store the extracted text.
     *
     * @return string The extracted text; "" when the file is missing, unreadable or empty.
     */
    public function decodePDF()
    {
        // A missing/unreadable file is deliberately treated as "no text", hence the suppression.
        $infile = @file_get_contents($this->filename);

        if (empty($infile)) {
            return "";
        }

        $transformations = array();
        $texts = array();

        // Get the list of all objects.
        preg_match_all("#obj[\n|\r](.*)endobj[\n|\r]#ismU", $infile . "endobj\r", $objects);
        $objects = isset($objects[1]) ? $objects[1] : array();

        foreach ($objects as $currentObject) {
            // Prevent time-out
            @set_time_limit(0);

            if ($this->showprogress) {
                // ob_flush() is only valid while an output buffer is active, and it has to
                // run before flush() for the buffered data to reach the client.
                if (ob_get_level() > 0) {
                    ob_flush();
                }

                flush();
            }

            // Only objects that carry a data stream are of interest.
            if (!preg_match("#stream[\n|\r](.*)endstream[\n|\r]#ismU", $currentObject . "endstream\r", $stream)) {
                continue;
            }

            $stream = ltrim($stream[1]);
            $options = $this->getObjectOptions($currentObject);

            // Skip every stream that declares Length1, Type or Subtype.
            if (!empty($options["Length1"]) || !empty($options["Type"]) || !empty($options["Subtype"])) {
                continue;
            }

            // Hack, length doesn't always seem to be correct
            unset($options["Length"]);

            $data = $this->getDecodedStream($stream, $options);

            if (!is_string($data) || $data === '') {
                continue;
            }

            if (preg_match_all("#BT[\n|\r| ](.*)ET[\n|\r| ]#ismU", $data . "ET\r", $textContainers)) {
                $this->getDirtyTexts($texts, $textContainers[1]);
            } else {
                $this->getCharTransformations($transformations, $data);
            }
        }

        // Text is resolved last so that mappings found after a text stream still apply.
        $this->decodedtext = $this->getTextUsingTransformations($texts, $transformations);

        return $this->decodedtext;
    }

    /**
     * Decode ASCII Hex.
     *
     * Whitespace is skipped, "%" starts a comment that runs to the end of the line,
     * and ">" terminates the data. A dangling final digit is treated as the high nibble
     * of a byte whose low nibble is 0.
     *
     * @param string $input ASCII string.
     *
     * @return string Decoded bytes; "" on an invalid character or a missing ">" terminator.
     */
    public function decodeAsciiHex($input)
    {
        $input = (string) $input;
        $length = strlen($input);
        $output = "";
        $isComment = false;
        $terminated = false;
        $pendingHigh = null; // High nibble still waiting for its low nibble.

        for ($i = 0; $i < $length; $i++) {
            $c = $input[$i];

            if ($c === '>') {
                $terminated = true;
                break;
            }

            if ($isComment) {
                if ($c === "\r" || $c === "\n") {
                    $isComment = false;
                }

                continue;
            }

            // Double quotes are required: '\n' in single quotes is a backslash followed by "n".
            if (strpos("\0\t\r\f\n ", $c) !== false) {
                continue;
            }

            if ($c === '%') {
                $isComment = true;
                continue;
            }

            if (!ctype_xdigit($c)) {
                return "";
            }

            $code = hexdec($c);

            if ($pendingHigh === null) {
                $pendingHigh = $code;
            } else {
                $output .= chr($pendingHigh * 16 + $code);
                $pendingHigh = null;
            }
        }

        if (!$terminated) {
            return "";
        }

        if ($pendingHigh !== null) {
            $output .= chr($pendingHigh * 16);
        }

        return $output;
    }

    /**
     * Decode ASCII 85.
     *
     * Every character from "!" to "u" is a data digit (this includes "%"), "z" stands for
     * four zero bytes, whitespace is ignored and "~" ends the data.
     *
     * @param string $input ASCII string.
     *
     * @return string Decoded bytes; "" when an invalid character or an impossible final group is met.
     */
    public function decodeAscii85($input)
    {
        $input = (string) $input;
        $length = strlen($input);
        $output = "";
        $ords = array();
        $state = 0; // Number of digits collected for the current 5-digit group.

        for ($i = 0; $i < $length && $input[$i] !== '~'; $i++) {
            $c = $input[$i];

            if (strpos("\0\t\r\f\n ", $c) !== false) {
                continue;
            }

            if ($c === 'z' && $state === 0) {
                $output .= str_repeat(chr(0), 4);
                continue;
            }

            $code = ord($c);

            if ($code < 33 || $code > 117) {
                return "";
            }

            $ords[$state++] = $code - 33;

            if ($state === 5) {
                $state = 0;

                for ($sum = 0, $j = 0; $j < 5; $j++) {
                    $sum = $sum * 85 + $ords[$j];
                }

                for ($j = 3; $j >= 0; $j--) {
                    $output .= chr(($sum >> ($j * 8)) & 0xFF);
                }
            }
        }

        // A final group of a single digit cannot encode any byte.
        if ($state === 1) {
            return "";
        }

        if ($state > 1) {
            // Partial group: n digits carry n - 1 bytes. Adding 1 to the last digit
            // compensates for the truncated low-order digits.
            for ($k = 0, $sum = 0; $k < $state; $k++) {
                $sum += ($ords[$k] + ($k === $state - 1 ? 1 : 0)) * pow(85, 4 - $k);
            }

            for ($k = 0; $k < $state - 1; $k++) {
                $output .= chr(($sum >> ((3 - $k) * 8)) & 0xFF);
            }
        }

        return $output;
    }

    /**
     * Decode Flate
     *
     * @param string $data zlib-compressed data.
     *
     * @return string|false The inflated data, or false when gzuncompress() fails.
     */
    public function decodeFlate($data)
    {
        return @gzuncompress($data);
    }

    /**
     * Get Object Options
     *
     * Reads the first "<< ... >>" dictionary of an object. Each "/Name value" entry becomes
     * Name => value (only the first token of the value is kept); a bare "/Name" becomes Name => true.
     *
     * @param string $object Raw object body.
     *
     * @return array
     */
    public function getObjectOptions($object)
    {
        $options = array();

        if (!preg_match("#<<(.*)>>#ismU", $object, $dictionary)) {
            return $options;
        }

        $entries = explode("/", $dictionary[1]);

        // Drop whatever precedes the first "/".
        array_shift($entries);

        foreach ($entries as $entry) {
            $entry = preg_replace("#\s+#", " ", trim($entry));

            if (strpos($entry, " ") !== false) {
                $parts = explode(" ", $entry);
                $options[$parts[0]] = $parts[1];
            } else {
                $options[$entry] = true;
            }
        }

        return $options;
    }

    /**
     * Get Decode Stream.
     *
     * Applies the filters named among the option keys, in the order they appear.
     * "Crypt" is recognised but not implemented.
     *
     * @param string $stream  Raw stream bytes.
     * @param array  $options Options as returned by getObjectOptions().
     *
     * @return string Decoded data; "" when a filter fails.
     */
    public function getDecodedStream($stream, $options)
    {
        if (empty($options["Filter"])) {
            return $stream;
        }

        $length = strlen($stream);

        // A non-numeric Length (e.g. a bare "/Length") must not reach substr().
        if (!empty($options["Length"]) && is_numeric($options["Length"])) {
            $length = (int) $options["Length"];
        }

        $decoded = (string) substr($stream, 0, $length);

        foreach (array_keys($options) as $key) {
            $key = (string) $key;

            if ($key === "ASCIIHexDecode") {
                $decoded = $this->decodeAsciiHex($decoded);
            } elseif ($key === "ASCII85Decode") {
                $decoded = $this->decodeAscii85($decoded);
            } elseif ($key === "FlateDecode") {
                $decoded = $this->decodeFlate($decoded);
            }
            // "Crypt": TO DO
        }

        return is_string($decoded) ? $decoded : "";
    }

    /**
     * Get Dirty Texts
     *
     * Appends the raw (still encoded) string operands of the TJ / Tj operators found in
     * each text container to $texts, one entry per container.
     *
     * @param array $texts          Collected raw texts (modified in place).
     * @param array $textContainers Contents of BT ... ET blocks.
     *
     * @return void
     */
    public function getDirtyTexts(&$texts, $textContainers)
    {
        $patterns = array(
            "#\[(.*)\]\s*TJ[\n|\r| ]#ismU",
            "#T[d|w|m|f]\s*(\(.*\))\s*Tj[\n|\r| ]#ismU",
            "#T[d|w|m|f]\s*(\[.*\])\s*Tj[\n|\r| ]#ismU",
        );

        foreach ($textContainers as $container) {
            // Only the first pattern that matches is used for a container.
            foreach ($patterns as $pattern) {
                if (preg_match_all($pattern, $container, $parts)) {
                    $texts[] = implode('', $parts[1]);
                    break;
                }
            }
        }
    }

    /**
     * Get Char Transformations
     *
     * Collects the beginbfchar / beginbfrange mappings of a stream. Keys are 4-digit
     * upper-case hex codes (left-padded with zeros); values are hex strings.
     *
     * @param array  $transformations Collected mappings (modified in place).
     * @param string $stream          Decoded stream data.
     *
     * @return void
     */
    public function getCharTransformations(&$transformations, $stream)
    {
        preg_match_all("#([0-9]+)\s+beginbfchar(.*)endbfchar#ismU", $stream, $chars, PREG_SET_ORDER);
        preg_match_all("#([0-9]+)\s+beginbfrange(.*)endbfrange#ismU", $stream, $ranges, PREG_SET_ORDER);

        foreach ($chars as $section) {
            $lines = preg_split("#[\r\n]+#", trim($section[2]));
            $limit = min((int) $section[1], count($lines));

            for ($k = 0; $k < $limit; $k++) {
                if (preg_match("#<([0-9a-f]{2,4})>\s+<([0-9a-f]{4,512})>#is", trim($lines[$k]), $map)) {
                    $transformations[$this->normalizeCode($map[1])] = $map[2];
                }
            }
        }

        foreach ($ranges as $section) {
            $lines = preg_split("#[\r\n]+#", trim($section[2]));
            $limit = min((int) $section[1], count($lines));

            for ($k = 0; $k < $limit; $k++) {
                $line = trim($lines[$k]);

                if (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+<([0-9a-f]{4})>#is", $line, $map)) {
                    // <from> <to> <first target>: targets increase together with the source code.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    $target = hexdec($map[3]);

                    for ($m = $from, $n = 0; $m <= $to; $m++, $n++) {
                        $transformations[sprintf("%04X", $m)] = sprintf("%04X", $target + $n);
                    }
                } elseif (preg_match("#<([0-9a-f]{4})>\s+<([0-9a-f]{4})>\s+\[(.*)\]#ismU", $line, $map)) {
                    // <from> <to> [<t1> <t2> ...]: one explicit target per source code.
                    $from = hexdec($map[1]);
                    $to = hexdec($map[2]);
                    preg_match_all("#<([0-9a-f]+)>#i", $map[3], $targets);
                    $targets = $targets[1];
                    $total = count($targets);

                    for ($m = $from, $n = 0; $m <= $to && $n < $total; $m++, $n++) {
                        $transformations[sprintf("%04X", $m)] = $this->normalizeCode($targets[$n]);
                    }
                }
            }
        }
    }

    /**
     * Get Text Using Transformations
     *
     * Turns the raw operands collected by getDirtyTexts() into text. Hex strings "<...>" are
     * split into character codes and mapped through $transformations; literal strings "(...)"
     * are copied with their backslash escapes resolved. Anything outside a string is ignored.
     * Each entry of $texts is followed by a newline in the result.
     *
     * @param array $texts           Raw text operands.
     * @param array $transformations Mappings as built by getCharTransformations().
     *
     * @return string
     */
    public function getTextUsingTransformations($texts, $transformations)
    {
        $document = "";

        foreach ($texts as $text) {
            $text = (string) $text;
            $length = strlen($text);
            $isHex = false;
            $depth = 0; // Nesting level inside a literal string; 0 means "outside".
            $hex = "";
            $plain = "";

            for ($j = 0; $j < $length; $j++) {
                $c = $text[$j];

                if ($depth > 0) {
                    // Inside "(...)": "<" and ">" are ordinary characters and
                    // balanced parentheses belong to the string.
                    if ($c === "\\") {
                        $plain .= $this->decodeEscape($text, $j);
                    } elseif ($c === "(") {
                        $depth++;
                        $plain .= $c;
                    } elseif ($c === ")") {
                        $depth--;

                        if ($depth === 0) {
                            $document .= $plain;
                        } else {
                            $plain .= $c;
                        }
                    } else {
                        $plain .= $c;
                    }

                    continue;
                }

                if ($isHex) {
                    if ($c === ">") {
                        $document .= $this->decodeHexString($hex, $transformations);
                        $isHex = false;
                    } elseif (ctype_xdigit($c)) {
                        // Whitespace between digits is skipped so that codes stay aligned.
                        $hex .= $c;
                    }

                    continue;
                }

                if ($c === "<") {
                    $hex = "";
                    $isHex = true;
                } elseif ($c === "(") {
                    $plain = "";
                    $depth = 1;
                }
            }

            $document .= "\n";
        }

        return $document;
    }

    /**
     * Resolve the backslash escape that starts at $offset inside a literal string.
     *
     * @param string $text   The raw operand being parsed.
     * @param int    $offset Index of the backslash; advanced past the characters that were consumed.
     *
     * @return string The text the escape stands for ("" when it produces nothing).
     */
    protected function decodeEscape($text, &$offset)
    {
        static $simple = array(
            "n"  => "\n",
            "r"  => "\r",
            "t"  => "\t",
            "b"  => "\x08",
            "f"  => "\f",
            "\\" => "\\",
            "("  => "(",
            ")"  => ")",
        );

        $next = isset($text[$offset + 1]) ? $text[$offset + 1] : '';

        // Backslash at the very end of the operand: nothing to escape.
        if ($next === '') {
            return '';
        }

        if (isset($simple[$next])) {
            $offset++;

            return $simple[$next];
        }

        // \ddd: one to three octal digits.
        if (preg_match('#^[0-7]{1,3}#', substr($text, $offset + 1, 3), $octal)) {
            $offset += strlen($octal[0]);

            return html_entity_decode("&#" . octdec($octal[0]) . ";", $this->convertquotes);
        }

        // Backslash + end-of-line continues the string on the next line.
        if ($next === "\r" || $next === "\n") {
            $offset++;

            if ($next === "\r" && isset($text[$offset + 1]) && $text[$offset + 1] === "\n") {
                $offset++;
            }

            return '';
        }

        // Unknown escape: drop the backslash, the following character is handled normally.
        return '';
    }

    /**
     * Convert the digits of one "<...>" hex string to text.
     *
     * @param string $hex             Hex digits found between "<" and ">".
     * @param array  $transformations Mappings as built by getCharTransformations().
     *
     * @return string
     */
    protected function decodeHexString($hex, $transformations)
    {
        if ($hex === '') {
            return '';
        }

        $text = '';

        foreach (str_split($hex, $this->multibyte) as $chunk) {
            $code = $this->normalizeCode($chunk);

            if (isset($transformations[$code])) {
                $code = (string) $transformations[$code];
            }

            // A mapping target may hold several 4-digit units (e.g. a ligature).
            $units = (strlen($code) > 4 && strlen($code) % 4 === 0) ? str_split($code, 4) : array($code);
            $count = count($units);

            for ($u = 0; $u < $count; $u++) {
                $unit = $units[$u];

                if ($u + 1 < $count) {
                    $high = hexdec($unit);
                    $low = hexdec($units[$u + 1]);

                    // Join a high/low surrogate pair into a single code point.
                    if ($high >= 0xD800 && $high <= 0xDBFF && $low >= 0xDC00 && $low <= 0xDFFF) {
                        $unit = dechex(0x10000 + (($high - 0xD800) << 10) + ($low - 0xDC00));
                        $u++;
                    }
                }

                $text .= html_entity_decode("&#x" . $unit . ";", $this->convertquotes, 'UTF-8');
            }
        }

        return $text;
    }

    /**
     * Bring a hex character code into the form used for mapping keys:
     * upper-case and left-padded with zeros to at least four digits.
     *
     * @param string $code Hex digits.
     *
     * @return string
     */
    protected function normalizeCode($code)
    {
        return strtoupper(str_pad($code, 4, "0", STR_PAD_LEFT));
    }

    /**
     * Method to set property showprogress
     *
     * @param boolean $showprogress
     *
     * @return static Return self to support chaining.
     */
    public function showProgress($showprogress)
    {
        $this->showprogress = $showprogress;

        return $this;
    }

    /**
     * Method to set property convertquotes
     *
     * @param int $convertquotes
     *
     * @return static Return self to support chaining.
     */
    public function convertQuotes($convertquotes)
    {
        $this->convertquotes = $convertquotes;

        return $this;
    }

    /**
     * Write the decoded text to a file.
     *
     * @param string $file Destination path.
     *
     * @return int|false Number of bytes written, or false on failure.
     */
    public function saveContentToFile($file)
    {
        return file_put_contents($file, $this->decodedtext);
    }
}
|
||
|
|
|
||
|
|
|
||
|
|
/**
 * Reads a text file line by line and groups position lines under order headers.
 *
 * A line starting with "oznacz.nad." switches the current order; a line whose first
 * space-separated token is numeric is stored as a position of the current order.
 */
class OrderReader
{
    /**
     * Path of the text file to read.
     *
     * @var string
     */
    public $filename;

    /**
     * Parsed positions, keyed by order identifier. Stays null until the first position is stored.
     *
     * @var array|null
     */
    public $orders;

    /**
     * Identifier taken from the most recent "oznacz.nad." line.
     *
     * @var string|null
     */
    public $currentOrder;

    /**
     * The line currently being processed (whitespace-normalised by clearString()).
     *
     * @var string
     */
    public $currentLine;

    /**
     * Open file handle, or null while no file is open.
     *
     * @var resource|null
     */
    private $handle;

    /**
     * Set the path of the file to read.
     *
     * @param string $file
     *
     * @return void
     */
    public function setFilename($file)
    {
        $this->filename = $file;
    }

    /**
     * Open the file for reading.
     *
     * @return void
     *
     * @throws \RuntimeException When the file cannot be opened.
     */
    public function openFile()
    {
        $handle = @fopen($this->filename, "r");

        // Failing here avoids looping forever on feof(false) later on.
        if ($handle === false) {
            throw new \RuntimeException(sprintf('Unable to open "%s" for reading.', $this->filename));
        }

        $this->handle = $handle;
    }

    /**
     * Close the file if it is open.
     *
     * @return void
     */
    public function closeFile()
    {
        if (is_resource($this->handle)) {
            fclose($this->handle);
        }

        $this->handle = null;
    }

    /**
     * Trim the current line and collapse every run of whitespace into a single space.
     *
     * @return void
     */
    public function clearString()
    {
        $this->currentLine = trim((string) $this->currentLine);
        $this->currentLine = preg_replace('!\s+!', ' ', $this->currentLine);
    }

    /**
     * Read the whole file and fill $orders.
     *
     * @return void
     *
     * @throws \LogicException When openFile() has not been called successfully.
     */
    public function getData()
    {
        if (!is_resource($this->handle)) {
            throw new \LogicException('openFile() must be called before getData().');
        }

        // fgets() returns false at end of file, which ends the loop.
        while (($line = fgets($this->handle)) !== false) {
            $this->currentLine = $line;

            $this->clearString();
            $this->getOrder();
            $this->getPosition();
        }
    }

    /**
     * Replace spaces with hyphens and strip everything except letters, digits and hyphens.
     *
     * @param string $string
     *
     * @return string
     */
    public function clean($string)
    {
        $string = str_replace(' ', '-', (string) $string);

        return preg_replace('/[^A-Za-z0-9\-]/', '', $string);
    }

    /**
     * Store the current line as a position when its first token is numeric.
     *
     * Tokens are read by index: 0 line number, 1 name, 2 "kl", 3 "chm", then quantity, weight,
     * price and total, which are parsed as floats with "," as the decimal separator.
     * Missing tokens are stored as null (text fields) or 0.0 (numeric fields).
     * NOTE(review): a numeric line with fewer than 8 tokens is still stored as a position;
     * confirm against the real input whether such lines should be skipped instead.
     *
     * @return void
     */
    public function getPosition()
    {
        $fields = explode(' ', (string) $this->currentLine);

        if (!is_numeric($fields[0])) {
            return;
        }

        // Pad so that short lines do not trigger undefined-offset warnings.
        $fields = array_pad($fields, 8, null);

        $position = array(
            'line_numer' => $fields[0],
            'name'       => $fields[1],
            'kl'         => $fields[2],
            'chm'        => $fields[3],
            'quantity'   => $this->toFloat($fields[4]),
            'weight'     => $this->toFloat($fields[5]),
            'price'      => $this->toFloat($fields[6]),
            'total'      => $this->toFloat($fields[7]),
        );

        // Positions found before any order header are filed under the "" key.
        $this->orders[(string) $this->currentOrder][] = $position;
    }

    /**
     * Switch the current order when the current line starts with "oznacz.nad.".
     * The identifier is everything from the 13th character of the line onwards.
     *
     * @return void
     */
    public function getOrder()
    {
        $line = (string) $this->currentLine;

        if (substr($line, 0, 11) === 'oznacz.nad.') {
            // The cast covers PHP versions where substr() returns false for a short line.
            $this->currentOrder = (string) substr($line, 12);
        }
    }

    /**
     * Parse a number that uses "," as decimal separator.
     *
     * @param string|null $value
     *
     * @return float
     */
    private function toFloat($value)
    {
        return (float) str_replace(',', '.', (string) $value);
    }
}
|
||
|
|
// Driver script: convert the PDF to text, save the text to disk,
// then parse the orders out of the saved file and dump them.
echo "<pre>";

$pdfPath = 'multizamowienie.pdf';
$textPath = 'mojt.txt';

// Step 1: PDF -> text file.
$converter = new Pdf2text;
$converter->setFilename($pdfPath);
$converter->decodePDF();
$converter->saveContentToFile($textPath);

// Step 2: text file -> orders.
$orderReader = new OrderReader;
$orderReader->setFilename($textPath);
$orderReader->openFile();
$orderReader->getData();

var_dump($orderReader->orders);
|