SitePoint Sponsor

User Tag List

Results 1 to 2 of 2
  1. #1
    SitePoint Zealot y2kbug's Avatar
    Join Date
    Nov 2004
    Location
    Hong Kong
    Posts
    173
    Mentioned
    0 Post(s)
    Tagged
    0 Thread(s)

    xml parsing in php

    I am using php to parse the content of this:
    http://www.engadget.com/rss.xml

    And I have got the error as listed below:
    Code:
    Warning: SimpleXMLElement::__construct() [simplexmlelement.--construct]: Entity: line 25: parser error : Input is not proper UTF-8, indicate encoding ! Bytes: 0xE9 0x67 0xE9 0x20 in /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php  on line 57
    
    Warning: SimpleXMLElement::__construct() [simplexmlelement.--construct]: href="http://www.engadget.com/photos/toshiba-port-g-r700-hands-on/">Toshiba Port in /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php on line 57
    
    Warning: SimpleXMLElement::__construct() [simplexmlelement.--construct]: ^ in /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php on line 57
    
    Fatal error: Uncaught exception 'Exception' with message 'String could not be parsed as XML' in /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php:57 Stack trace: #0 /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php(57): SimpleXMLElement->__construct('<?xml version="...', 16384) #1 /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php(234): MyParser->parseXmlData() #2 /www/virtual/dummy.com/htdocs/cronjob/index.php(120): MyParser->parseFeed() #3 /www/virtual/dummy.com/htdocs/include/application.class.php(238): CornJobApplication->controller_default(Array) #4 /www/virtual/dummy.com/htdocs/include/application.class.php(82): Application->call_controller() #5 /www/virtual/dummy.com/htdocs/include/application-commonapplication.class.php(10): Application->__construct('/cronjob/', Array) #6 /www/virtual/dummy.com/ht in /www/virtual/dummy.com/htdocs/common/libraries/MyParser.php on line 57
    Is there any way to continue the script running after the fatal error?
    or
    Is there any way to check the format of the xml is valid before running the "new SimpleXmlElement" line?

    Attached is the parser script:
    Code:
    class MyParser
    {
    
        private $xmlData; //XML data Read from the Feed Url
        private $curlHandler; // Curl Object //
        private $feedUrl; // Feed Url , Url from which data scrapped
        private $parserData; //Xml Components frpm XML parser
        private $feedResults; // Final Results in Array format //
    
    
        /*
        * MyParser Construct
        * @Param as the Feed Url
        * Init settings
        */
        function __construct($url)
        {
            $this->feedUrl = $url;
            $this->parserData = null;
            $this->xmlData = null;
            $this->feedResults = array();
        }
    
        /*
        * Read Feed Content from Remote Url
        * Fetch XML Content Using CURL
        */
        function readXml()
        {
            if(isset($this->feedUrl)){
                try
                {
                    $this->curlHandler = curl_init($this->feedUrl);
                    curl_setopt($this->curlHandler, CURLOPT_RETURNTRANSFER, true);
                    curl_setopt($this->curlHandler, CURLOPT_HEADER, 0);
                    $this->xmlData = curl_exec($this->curlHandler);
                    curl_close($this->curlHandler);
                    return true;
                }catch(Exception $c){
                    return false;
                }
            }else{
                return false;
            }
        }
    
    
        /*
        * Entry Point for Parsing from XML DATA
        * Data will be parsed from Rss Or Atom Feeds.
        */
    
        function parseXmlData()
        {
            $docElim = new SimpleXmlElement($this->xmlData, LIBXML_NOCDATA);
    
            $this->parserData = $docElim;
    
            //print_r( $docElim ); exit();
            //print_r( $docElim->getDocNamespaces() ); exit();
            //print_r( $docElim->getNamespaces() ); exit();
    
            $is_rss1 = false;
            foreach($docElim->getDocNamespaces() as $nsv)
            {
                if( $nsv=='http://purl.org/rss/1.0/' )
                {
                    $is_rss1 = true;
                    break;
                }
            }
            if(isset($docElim->channel))
            {
                if($is_rss1){
                    $this->parseFromRSS1(); //RSS Feed
                }
                else{
                    $this->parseFromRSS(); //RSS Feed
                }
            }
            else if(isset($docElim->entry))
            {
                $this->parseFromATOM(); //Atom Feed
            }
            /*
            if(isset($docElim->channel)){
                $this->parseFromRSS(); //RSS Feed
            }else if(isset($docElim->entry)){
                $this->parseFromATOM(); //Atom Feed
            }*/
            return $this->feedResults;
        }
    
    
        //Focus
        function parseFromRSS1(){
            $this->feedResults["headInfo"]["feedType"] = "RSS";
            $this->feedResults["headInfo"]["title"] = (string)$this->parserData->channel->title;
            $this->feedResults["headInfo"]["description"] = (string)$this->parserData->channel->description;
            $this->feedResults["headInfo"]["link"] = (string)$this->parserData->channel->link;
            $this->feedResults["headInfo"]["category"] = (string)$this->parserData->channel->category;
            $this->feedResults["headInfo"]["docs"] = (string)$this->parserData->channel->docs;
            $this->feedResults["headInfo"]["copyright"] = (string)$this->parserData->channel->copyright;
            $this->feedResults["headInfo"]["pubDate"] = (string)$this->parserData->channel->pubDate;
            $this->feedResults["headInfo"]["webMaster"] = (string)$this->parserData->channel->webMaster;
            $this->feedResults["headInfo"]["imageUrl"] = (string)$this->parserData->channel->image->url;
            $this->feedResults["headInfo"]["imageWidth"] = (string)$this->parserData->channel->image->width;
            $this->feedResults["headInfo"]["imageHeight"] = (string)$this->parserData->channel->image->height;
            $this->feedResults["headInfo"]["imageLink"] = (string)$this->parserData->channel->image->link;
            $this->feedResults["headInfo"]["imageTitle"] = (string)$this->parserData->channel->image->title;
    
    
            /*
            * Rss Feed Items
            * Items and common fields only
            */
            $rec = 0;
            foreach($this->parserData->item as $key=>$val){
                $this->feedResults["items"][$rec]["title"] = (string)$val->title;
                $this->feedResults["items"][$rec]["description"] = (string)$val->description;
                $this->feedResults["items"][$rec]["link"] = (string)$val->link;
                $this->feedResults["items"][$rec]["comments"] = (string)$val->comments;
                $this->feedResults["items"][$rec]["category"] = (string)$val->category;
                $this->feedResults["items"][$rec]["pubDate"] = (string)$val->pubDate;
                $rec++;
            }
            $this->feedResults["headInfo"]["countRecords"] = $rec;
    
        }
    
    
    
        /*
        * Parser now creates the Feed Results From Rss Feeds/
        * Rss Feedas are popular feeds for news and podcast
        * only comman items are added to results
        */
    
        function parseFromRSS(){
            /*
            * Retrieve Header Information
            * Get Common Header Items
            */
            $this->feedResults["headInfo"]["feedType"] = "RSS";
            $this->feedResults["headInfo"]["title"] = (string)$this->parserData->channel->title;
            $this->feedResults["headInfo"]["description"] = (string)$this->parserData->channel->description;
            $this->feedResults["headInfo"]["link"] = (string)$this->parserData->channel->link;
            $this->feedResults["headInfo"]["category"] = (string)$this->parserData->channel->category;
            $this->feedResults["headInfo"]["docs"] = (string)$this->parserData->channel->docs;
            $this->feedResults["headInfo"]["copyright"] = (string)$this->parserData->channel->copyright;
            $this->feedResults["headInfo"]["pubDate"] = (string)$this->parserData->channel->pubDate;
            $this->feedResults["headInfo"]["webMaster"] = (string)$this->parserData->channel->webMaster;
            $this->feedResults["headInfo"]["imageUrl"] = (string)$this->parserData->channel->image->url;
            $this->feedResults["headInfo"]["imageWidth"] = (string)$this->parserData->channel->image->width;
            $this->feedResults["headInfo"]["imageHeight"] = (string)$this->parserData->channel->image->height;
            $this->feedResults["headInfo"]["imageLink"] = (string)$this->parserData->channel->image->link;
            $this->feedResults["headInfo"]["imageTitle"] = (string)$this->parserData->channel->image->title;
    
    
            /*
            * Rss Feed Items
            * Items and common fields only
            */
            $rec = 0;
            foreach($this->parserData->channel->item as $key=>$val){
                $this->feedResults["items"][$rec]["title"] = (string)$val->title;
                $this->feedResults["items"][$rec]["description"] = (string)$val->description;
                $this->feedResults["items"][$rec]["link"] = (string)$val->link;
                $this->feedResults["items"][$rec]["comments"] = (string)$val->comments;
                $this->feedResults["items"][$rec]["category"] = (string)$val->category;
                $this->feedResults["items"][$rec]["pubDate"] = (string)$val->pubDate;
                $rec++;
            }
            $this->feedResults["headInfo"]["countRecords"] = $rec;
    
        }
    
        /*
        * Parse Data From Atom Content
        * Atom Feeds vary from RSS in elements
        * Here the data is scrapped from Atom Feed.
        */
    
        function parseFromATOM(){
            /*
            * Retrieve Header Information
            * Get Common Header Items
            */
            $this->feedResults["headInfo"]["feedType"] = "ATOM";
            $this->feedResults["headInfo"]["authorName"] = (string)$this->parserData->author->name;
            $this->feedResults["headInfo"]["authorEmail"] = (string)$this->parserData->author->email;
            $this->feedResults["headInfo"]["copyright"] = (string)$this->parserData->author->copyright;
            $this->feedResults["headInfo"]["modified"] = (string)$this->parserData->author->modified;
    
            /*
            * ATOM Feed Items
            * Items and common fields only
            */
            $rec= 0;
            foreach($this->parserData->entry as $key=>$val){
                $this->feedResults["items"][$rec]["title"] = (string)$val->title;
                $this->feedResults["items"][$rec]["linkUrl"] = (string)$val->link{"href"};
                $this->feedResults["items"][$rec]["linkType"] = (string)$val->link->{"type"};
                $this->feedResults["items"][$rec]["issued"] = (string)$val->issued;
                $this->feedResults["items"][$rec]["id"] = (string)$val->id;
                $this->feedResults["items"][$rec]["modified"] = (string)$val->modified;
                $this->feedResults["items"][$rec]["content"] = (string)$val->content;
                $rec++;
            }
            $this->feedResults["headInfo"]["countRecords"] = $rec;
    
        }
    
    
        /*
        * Method is the entry to MyParser
        * Function Called from invoking object
        * @ No parameters
        * Returns the Feed Results in array
        */
    
    
        function parseFeed()
        {
            if($this->readXml())
            {
                if(empty($this->xmlData)) {
                    die ("Nothing to parse this time");
                    return null;
                }else{
                    if(class_exists("SimpleXmlElement")){
                        $results = $this->parseXmlData();
                        return $results;
                    }else{
                        die("LIB XML Not installed");
                        return null;
                    }
                }
            }else{
                die( "Sorry , Cannot read xml data from source");
                return null;
            }
        }
    
    }
    Thanks!!

  2. #2
    SitePoint Addict fattyjules's Avatar
    Join Date
    Dec 2005
    Posts
    295
    Mentioned
    0 Post(s)
    Tagged
    0 Thread(s)
    It doesn't appear to be a valid XML document. Both Chrome and Opera complain about it. Might not be your code's fault.


Bookmarks

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •