<?php
/**
 * This class is a remodeling of the HTMLScraping class.
 * 
 * @see http://www.rcdtokyo.com/etc/htmlscraping/
 */

/**
 * ---------------------------------------------------------------------
 * HTMLScraping class
 * ---------------------------------------------------------------------
 * PHP versions 5 (5.1.3 and later)
 * ---------------------------------------------------------------------
 * LICENSE: This source file is subject to the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * either version 2.1 of the License, or any later version
 * that is available through the world-wide-web at the following URI:
 * http://www.gnu.org/licenses/lgpl.html
 * If you did not have a copy of the GNU Lesser General Public License
 * and are unable to obtain it through the web, please write to
 * the Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 * ---------------------------------------------------------------------
 */

require_once 'Diggin/Scraper/Adapter/Interface.php';

/**
 * Scraper adapter that turns an HTTP response body (arbitrary, possibly
 * malformed HTML in any character encoding) into a UTF-8 XHTML string and
 * then into a SimpleXMLElement.
 *
 * The code deliberately stays compatible with PHP 5.1 (no closures, no
 * short array syntax); regex callbacks are private methods referenced as
 * array($this, 'method').
 */
class Diggin_Scraper_Adapter_Htmlscraping implements Diggin_Scraper_Adapter_Interface
{
    /**
     * Adapter configuration; keys are lower-cased by setConfig().
     * getXmlObject() reads the 'url' key as the document's base URL.
     *
     * @var array
     */
    protected $config = array();

    /**
     * CDATA and comment sections set aside during preprocessing,
     * indexed by their placeholder number.
     *
     * @var array
     */
    private $backup = array();

    /**
     * Number of placeholders issued so far by backup().
     *
     * @var integer
     */
    private $backup_count = 0;

    /**
     * Build a SimpleXMLElement from the response and make sure it carries
     * a <base href="..."> element holding an absolute URL.
     *
     * Reads $this->config['url'], so setConfig() is expected to have been
     * called with a 'url' entry beforehand.
     *
     * @param  object  $response  object exposing getBody() and getHeader()
     * @return SimpleXMLElement
     * @throws Exception  when preprocessing or XML parsing fails
     */
    final public function getXmlObject($response)
    {
        $xhtml = $this->getXhtml($response);

        /*
         * Remove default namespace.
         * This is because that SimpleXMLElement->registerXPathNamespace() may cause
         * a problem under some circumstances (confirmed with PHP 5.1.6 so far).
         * So you do not need to use SimpleXMLElement->registerXPathNamespace()
         * when you use SimpleXMLElement->xpath().
         */
        $responseBody = preg_replace('/\sxmlns="[^"]+"/', '', $xhtml);

        /*
         * Replace every '&' with '&amp;'
         * for XML parser not to break on non-predefined entities.
         * So you may need to replace '&amp;' with '&'
         * to have the original HTML string from returned SimpleXML object
         * (dumpElement() does exactly that).
         */
        $responseBody = str_replace('&', '&amp;', $responseBody);

        // Parser warnings are suppressed; a parse failure still surfaces
        // as the exception thrown by the SimpleXMLElement constructor.
        $xml_object = @new SimpleXMLElement($responseBody);

        if ($bases = $xml_object->xpath('//base[@href]')) {
            // An existing <base> may be relative: make it absolute.
            $bases[0]['href'] = $this->getAbsoluteUrl((string) $bases[0]['href'], $this->config['url']);
        } else {
            // No <base> in the document: add one pointing at the configured URL.
            if (!$xml_object->head) {
                $xml_object->addChild('head');
            }
            $base = $xml_object->head->addChild('base');
            $base->addAttribute('href', $this->config['url']);
        }

        return $xml_object;
    }

    /**
     * Return XHTML string based on SimpleXML element.
     * Reverses the '&' => '&amp;' substitution applied by getXmlObject().
     *
     * @param  object  $element
     * @return string
     */
    final public function dumpElement($element)
    {
        return str_replace('&amp;', '&', $element->asXML());
    }

    /**
     * Return a formatted UTF-8 XHTML string (with XML and DOCTYPE
     * declarations prepended) created from the body of the given response.
     *
     * @param  object  $response  object exposing getBody() and getHeader()
     * @return string
     * @throws Exception  when the body is empty after preprocessing or
     *                    its character encoding cannot be detected/converted
     */
    final public function getXhtml($response)
    {
        /*
         * Remove BOM and NULLs.
         */
        $responseBody = preg_replace('/^\xef\xbb\xbf/', '', $response->getBody());
        $responseBody = str_replace("\x0", '', $responseBody);

        /*
         * Initialize the backups.
         */
        $this->backup = array();
        $this->backup_count = 0;

        /*
         * SCRIPT and STYLE elements are removed entirely, content included.
         */
        $tags = array('script', 'style');
        foreach ($tags as $tag) {
            $responseBody = preg_replace("/<$tag\b[^>]*?>.*?<\/$tag\b[^>]*?>/si", '', $responseBody);
        }

        /*
         * Backup CDATA sections for later process.
         */
        $responseBody = preg_replace_callback(
            '/<!\[CDATA\[.*?\]\]>/s', array($this, 'backup'), $responseBody
        );

        /*
         * Comment section must not contain two or more adjacent hyphens.
         */
        $responseBody = preg_replace_callback(
            '/<!--(.*?)-->/si', array($this, '_collapseCommentHyphens'), $responseBody
        );

        /*
         * Backup comment sections for later process.
         */
        $responseBody = preg_replace_callback(
            '/<!--.*?-->/s', array($this, 'backup'), $responseBody
        );

        /*
         * Process tags that is potentially dangerous for XML parsers.
         */
        $responseBody = preg_replace_callback(
            '/(<textarea\b[^>]*?>)(.*?)(<\/textarea\b[^>]*?>)/si',
            array($this, '_escapeTextareaContent'),
            $responseBody
        );
        $responseBody = preg_replace_callback(
            '/<xmp\b[^>]*?>(.*?)<\/xmp\b[^>]*?>/si',
            array($this, '_wrapAsPre'),
            $responseBody
        );
        $responseBody = preg_replace_callback(
            '/<plaintext\b[^>]*?>(.*)$/si',
            array($this, '_wrapAsPre'),
            $responseBody
        );

        /*
         * Remove DTD declarations, wrongly placed comments etc.
         * This must be done before removing DOCTYPE.
         */
        $responseBody = preg_replace('/<!(?!DOCTYPE)[^>]*?>/si', '', $responseBody);

        /*
         * XML and DOCTYPE declaration will be replaced.
         */
        $responseBody = preg_replace('/<!DOCTYPE\b[^>]*?>/si', '', $responseBody);
        $responseBody = preg_replace('/<\?xml\b[^>]*?\?>/si', '', $responseBody);
        if (preg_match('/^\s*$/s', $responseBody)) {
            throw new Exception('The entity body became empty after preprocessing.');
        }

        /*
         * Detect character encoding and convert to UTF-8.
         * The HTTP Content-Type header is consulted first; only a plain
         * non-empty string value is usable by _getCharsetFromCType().
         */
        $encoding = false;
        $contentType = $response->getHeader('content-type');
        if (is_string($contentType) and '' !== $contentType) {
            $encoding = $this->_getCharsetFromCType($contentType);
        }
        // Fall back to the first <meta http-equiv="content-type"> element.
        if (!$encoding and preg_match_all('/<meta\b[^>]*?>/si', $responseBody, $matches)) {
            foreach ($matches[0] as $value) {
                if (strtolower($this->_getAttribute('http-equiv', $value)) == 'content-type') {
                    $content = $this->_getAttribute('content', $value);
                    $encoding = $content;
                    if (false !== $content) {
                        $encoding = $this->_getCharsetFromCType($content);
                        break;
                    }
                }
            }
        }

        /*
         * Use mbstring to convert character encoding if available.
         * Otherwise use iconv (iconv may try to detect character encoding automatically).
         * Do not trust the declared encoding and do conversion even if UTF-8.
         */
        if (extension_loaded('mbstring')) {
            if (!$encoding) {
                @mb_detect_order('ASCII, JIS, UTF-8, EUC-JP, SJIS');
                // Check the detection result before handing it on, so a
                // failed detection is never passed to mb_preferred_mime_name().
                $detected = @mb_detect_encoding($responseBody);
                if (false === $detected
                    or false === ($encoding = @mb_preferred_mime_name($detected))) {
                    throw new Exception('Failed detecting character encoding.');
                }
            }
            // Converts the body and every backed-up section in place.
            @mb_convert_variables('UTF-8', $encoding, $responseBody, $this->backup);
        } else {
            if (false === ($responseBody = @iconv($encoding, 'UTF-8', $responseBody))) {
                throw new Exception('Failed converting character encoding.');
            }
            foreach ($this->backup as $key => $value) {
                if (false === ($this->backup[$key] = @iconv($encoding, 'UTF-8', $value))) {
                    throw new Exception('Failed converting character encoding.');
                }
            }
        }

        /*
         * Restore CDATAs and comments.
         */
        for ($i = 0; $i < $this->backup_count; $i++) {
            $responseBody = str_replace("<restore count=\"$i\" />", $this->backup[$i], $responseBody);
        }

        /*
         * Use Tidy to format HTML if available.
         * Otherwise, use HTMLParser class (is slower and consumes much memory).
         */
        if (extension_loaded('tidy')) {
            $tidy = new tidy;
            $tidy->parseString($responseBody, array('output-xhtml' => true), 'UTF8');
            $tidy->cleanRepair();
            $responseBody = $tidy->html();
        } else {
            require_once 'HTMLParser.class.php';
            $parser = new HTMLParser;
            $format_rule = require 'xhtml1-transitional_dtd.inc.php';
            $parser->setRule($format_rule);
            $parser->setRoot('html', array('xmlns' => 'http://www.w3.org/1999/xhtml'));
            $parser->setGenericParent('body');
            $parser->parse($responseBody);
            $responseBody = $parser->dump();
        }

        /*
         * Valid XHTML DOCTYPE declaration (with DTD URI) is required
         * for SimpleXMLElement->asXML() method to produce proper XHTML tags.
         */
        $declarations = '<?xml version="1.0" encoding="UTF-8"?>';
        $declarations .= '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ';
        $declarations .= '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">';
        $responseBody = "$declarations$responseBody";

        return $responseBody;
    }

    /**
     * preg_replace_callback handler: rewrite a comment so that its body
     * contains no run of two or more hyphens.
     *
     * @param  array   $matches  [1] is the comment body
     * @return string
     */
    private function _collapseCommentHyphens($matches)
    {
        return "<!-- " . preg_replace('/-{2,}/', '-', $matches[1]) . " -->";
    }

    /**
     * preg_replace_callback handler: escape every '<' inside a textarea,
     * keeping the opening ([1]) and closing ([3]) tags untouched.
     *
     * @param  array   $matches
     * @return string
     */
    private function _escapeTextareaContent($matches)
    {
        return $matches[1] . str_replace('<', '&lt;', $matches[2]) . $matches[3];
    }

    /**
     * preg_replace_callback handler: replace an <xmp> or <plaintext>
     * element by a <pre> element whose content ([1]) has '<' escaped.
     *
     * @param  array   $matches
     * @return string
     */
    private function _wrapAsPre($matches)
    {
        return "<pre>" . str_replace('<', '&lt;', $matches[1]) . "</pre>";
    }

    /**
     * Extract the charset from a Content-Type value such as
     * "text/html; charset=Shift_JIS".
     *
     * Only the second ';'-separated segment is inspected.
     *
     * @param  string  $string
     * @return mixed   'UTF-8', the (mbstring-normalised, when available)
     *                 charset name, or false when none is present
     */
    private function _getCharsetFromCType($string)
    {
        $array = explode(';', $string);
        if (isset($array[1])) {
            $array = explode('=', $array[1]);
            if (isset($array[1])) {
                $charset = trim($array[1]);
                if (preg_match('/^UTF-?8$/i', $charset)) {
                    return 'UTF-8';
                } elseif (function_exists('mb_preferred_mime_name')) {
                    return @mb_preferred_mime_name($charset);
                } else {
                    return $charset;
                }
            }
        }
        return false;
    }

    /**
     * Return the value of attribute $name found in the tag text $string,
     * with surrounding quotes stripped, or false when it is absent.
     *
     * @param  string  $name
     * @param  string  $string
     * @return mixed
     */
    private function _getAttribute($name, $string)
    {
        $search = "'[\s\'\"]\b".$name."\b\s*=\s*([^\s\'\">]+|\'[^\']+\'|\"[^\"]+\")'si";
        if (preg_match($search, $string, $matches)) {
            return preg_replace('/^\s*[\'\"](.+)[\'\"]\s*$/s', '$1', $matches[1]);
        } else {
            return false;
        }
    }

    /**
     * preg_replace_callback handler: store the matched section and return
     * a numbered <restore count="N" /> placeholder in its place.
     *
     * @param  array   $matches
     * @return string
     */
    private function backup($matches)
    {
        $this->backup[] = $matches[0];
        $replace = "<restore count=\"{$this->backup_count}\" />";
        $this->backup_count++;

        return $replace;
    }

    /**
     * Adapter entry point; delegates to getXmlObject().
     *
     * @param  object  $response
     * @return SimpleXMLElement
     */
    public function readData($response)
    {
        return $this->getXmlObject($response);
    }

    /**
     * Merge the given options into the configuration, lower-casing keys.
     *
     * @param  array  $config
     * @return Diggin_Scraper_Adapter_Htmlscraping  this instance (fluent)
     * @throws Diggin_Scraper_Adapter_Exception  when $config is not an array
     */
    public function setConfig($config = array())
    {
        if (!is_array($config)) {
            throw new Diggin_Scraper_Adapter_Exception('Expected array parameter, given ' . gettype($config));
        }

        foreach ($config as $k => $v) {
            $this->config[strtolower($k)] = $v;
        }

        return $this;
    }

    /**
     * Resolve $url against $base_url.
     *
     * $url is returned unchanged when it already has a scheme or when
     * $base_url cannot be parsed. Relative paths are resolved through
     * Net_URL::resolvePath(); that class is not loaded by this file and
     * must be available to the caller's environment.
     *
     * @param  string  $url
     * @param  string  $base_url
     * @return string
     */
    public static function getAbsoluteUrl($url, $base_url)
    {
        if (preg_match('/^[\w\+\-\.]+:/', $url) or false === ($bases = @parse_url($base_url))) {
            return $url;
        }

        // Network-path reference ("//host/path"): only the scheme is inherited.
        if (0 === strpos($url, '//')) {
            return "$bases[scheme]:$url";
        }

        $authority = "$bases[scheme]://$bases[host]".(isset($bases['port']) ? ":$bases[port]" : '');

        // Absolute path: keep the base authority, replace the whole path.
        if (0 === strpos($url, '/')) {
            return $authority.$url;
        }

        // Relative path: resolve against the directory part of the base path.
        if (!isset($bases['path'])) {
            $bases['path'] = '/';
        }
        return $authority.
            Net_URL::resolvePath(substr($bases['path'], 0, strrpos($bases['path'], '/') + 1).$url);
    }
}