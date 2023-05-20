QUESTION:

Why Xml Parser shows error if there are xml tag errors on third party websites crawler is presently crawling ?

DESCRIPTION:

I get echoed:

**( ! ) Warning: DOMDocument::loadXML(): Start tag expected, '<' not found in Entity, line: 6 in C:\wamp64\www\... on line 44 Call Stack # Time Memory Function Location 1 0.0034 362440 {main}( ) ...\crawler_Test.php:0 2 14.0745 365456 loadXML( $source = class SimpleXMLElement { public $sitemap = [0 => class SimpleXMLElement { ... }, 1 => class SimpleXMLElement { ... }, 2 => class SimpleXMLElement { ... }, 3 => class SimpleXMLElement { ... }] } ) 46 73 SiteMaps Crawled: --- Array ( ) Html Pages Crawled: --- Array ( ) Array ( ) Array ( ) Array ( ) 50 Array ( ) Array ( ) Array ( ) **

Error gets shown here:

$dom->loadXML($xml); //LINE: 44

I can’t afford to get errors like this as not all websites will be code error free.

I want to suppress such error that gets shown due to coding errors on webpages crawled on third party websites. How to do this ?

NOTE:

I should only get error if my own code (crawler is coded in error) has errors.

I do not know how to suppress the error.

I would like to know if there are any error on my coding that could cause issues while crawling the web or later on at any point.

<?php ini_set('display_errors',1); ini_set('display_startup_errors',1); error_reporting(E_ALL); //START OF SCRIPT FLOW. //Preparing Crawler & Session: Initialising Variables. //Preparing $ARRAYS For Step 1: To Deal with Xml Links meant for Crawlers only. //SiteMaps Details Scraped from SiteMaps or Xml Files. $sitemaps = []; //This will list extracted further Xml SiteMap links (.xml) found on Sitemaps (.xml). $sitemaps_last_mods = []; //This will list dates of SiteMap pages last modified - found on Sitemaps. $sitemaps_change_freqs = []; //his will list SiteMap dates of html pages frequencies of page updates - found on Sitemaps. $sitemaps_priorities = []; //This will list SiteMap pages priorities - found on Sitemaps. //Webpage Details Scraped from SiteMaps or Xml Files. $html_page_urls = []; //This will list extracted html links Urls (.html, .htm, .php) - found on Sitemaps (.xml). $html_page_last_mods = []; //This will list dates of html pages last modified - found on Sitemap. $html_page_change_freqs = []; //his will list dates of html pages frequencies of page updates - found on Sitemaps. $html_page_priorities = []; //This will list html pages priorities - found on Sitemaps. //Preparing $ARRAYS For Step 2: To Deal with html pages meant for Human Visitors only. //Data Scraped from Html Files. Not Xml SiteMap Files. $html_page_meta_names = []; //This will list crawled pages Meta Tag Names - found on html pages. $html_page_meta_descriptions = []; //This will list crawled pages Meta Tag Descriptions - found on html pages. $html_page_titles = []; //This will list crawled pages Titles - found on html pages. // ----- //Step 1: Initiate Session - Feed Xml SiteMap Url. Crawing Starting Point. //Crawl Session Starting Page/Initial Xml Sitemap. (NOTE: Has to be .xml SItemap). $initial_url = "https://www.rocktherankings.com/sitemap_index.xml"; //Has more xml files. //$xmls = file_get_contents($initial_url); //Should I stick to this line or below line ? //Parse the sitemap content to object //$xml = simplexml_load_string($xmls); //Should I stick to this line or above line ? $xml = simplexml_load_string(file_get_contents($initial_url)); //Code from Dani: https://www.daniweb.com/programming/web-development/threads/540168/what-to-lookout-for-to-prevent-crawler-traps $dom = new DOMDocument(); $dom->loadXML($xml); //LINE: 44 echo __LINE__; echo '<br>'; //LINE: 46 extract_links($xml); echo __LINE__; echo '<br>'; //LINE: 50 foreach($sitemaps AS $sitemap) { echo __LINE__; echo '<br>'; extract_links($sitemap); //Extract Links on page. } foreach($html_page_urls AS $html_page_url) { echo __LINE__; echo '<br>'; extract_links($html_page_url); //Extract Links on page. } scrape_page_data(); //Scrape Page Title & Meta Tags. //END OF SCRIPT FLOW. //DUNCTIONS BEYOND THIS POINT. //Links Extractor. function extract_links() { echo __LINE__; echo '<br>'; //LINE: 73 GLOBAL $dom; //Trigger following IF/ELSEs on each Crawled Page to check for link types. Whether Links lead to more SiteMaps (.xml) or webpages (.html, .htm, .php, etc.). if ($dom->nodeName === 'sitemapindex') //Current Xml SiteMap Page lists more Xml SiteMaps. Lists links to Xml links. Not lists links to html links. { echo __LINE__; echo '<br>'; //parse the index // retrieve properties from the sitemap object foreach ($xml->sitemapindex as $urlElement) //Extracts html file urls. { // get properties $sitemaps[] = $sitemap_url = $urlElement->loc; $sitemaps_last_mods[] = $last_mod = $urlElement->lastmod; $sitemaps_change_freqs[] = $change_freq = $urlElement->changefreq; $sitemaps_priorities[] = $priority = $urlElement->priority; // print out the properties echo 'url: '. $sitemap_url . '<br>'; echo 'lastmod: '. $last_mod . '<br>'; echo 'changefreq: '. $change_freq . '<br>'; echo 'priority: '. $priority . '<br>'; echo '<br>---<br>'; } } else if ($dom->nodeName === 'urlset') //Current Xml SiteMap Page lists no more Xml SiteMap links. Lists only html links. { echo __LINE__; echo '<br>'; //parse url set // retrieve properties from the sitemap object foreach ($xml->urlset as $urlElement) //Extracts Sitemap Urls. { // get properties $html_page_urls[] = $html_page_url = $urlElement->loc; $html_page_last_mods[] = $last_mod = $urlElement->lastmod; $html_page_change_freqs[] = $change_freq = $urlElement->changefreq; $html_page_priorities[] = $priority = $urlElement->priority; // print out the properties echo 'url: '. $html_page_url . '<br>'; echo 'lastmod: '. $last_mod . '<br>'; echo 'changefreq: '. $change_freq . '<br>'; echo 'priority: '. $priority . '<br>'; echo '<br>---<br>'; } } GLOBAL $sitemaps; GLOBAL $sitemaps_last_mods; GLOBAL $sitemaps_change_freqs; GLOBAL $sitemaps_priorities; GLOBAL $html_page_urls; GLOBAL $html_page_last_mods; GLOBAL $html_page_change_freqs; GLOBAL $html_page_priorities; echo 'SiteMaps Crawled: ---'; echo '<br><br>'; if(array_count_values($sitemaps)>0) { print_r($sitemaps); echo '<br>'; } elseif(array_count_values($sitemaps_last_mods)>0) { print_r($sitemaps_last_mods); echo '<br>'; } elseif(array_count_values($sitemaps_change_freqs)>0) { print_r($sitemaps_change_freqs); echo '<br>'; } elseif(array_count_values($sitemaps_priorities)>0) { print_r($sitemaps_priorities); echo '<br><br>'; } echo 'Html Pages Crawled: ---'; echo '<br><br>'; if(array_count_values($html_page_urls)>0) { print_r($html_page_urls); echo '<br>'; } if(array_count_values($html_page_last_mods)>0) { print_r($html_page_last_mods); echo '<br>'; } if(array_count_values($html_page_change_freqs)>0) { print_r($html_page_change_freqs); echo '<br>'; } if(array_count_values($html_page_priorities)>0) { print_r($html_page_priorities); echo '<br>'; } } //Meta Data & Title Extractor. function scrape_page_data() { GLOBAL $html_page_urls; if(array_count_values($html_page_urls)>0) { foreach($html_page_urls AS $url) { // https://www.php.net/manual/en/function.file-get-contents $html = file_get_contents($url); //https://www.php.net/manual/en/domdocument.construct.php $doc = new DOMDocument(); // https://www.php.net/manual/en/function.libxml-use-internal-errors.php libxml_use_internal_errors(true); // https://www.php.net/manual/en/domdocument.loadhtml.php $doc->loadHTML($html, LIBXML_COMPACT|LIBXML_NOERROR|LIBXML_NOWARNING); // https://www.php.net/manual/en/function.libxml-clear-errors.php libxml_clear_errors(); // https://www.php.net/manual/en/domdocument.getelementsbytagname.php $meta_tags = $doc->getElementsByTagName('meta'); // https://www.php.net/manual/en/domnodelist.item.php if ($meta_tags->length > 0) { // https://www.php.net/manual/en/class.domnodelist.php foreach ($meta_tags as $tag) { // https://www.php.net/manual/en/domnodelist.item.php echo 'Meta Name: ' .$meta_name = $tag->getAttribute('name'); echo '<br>'; echo 'Meta Content: ' .$meta_content = $tag->getAttribute('content'); echo '<br>'; $html_page_meta_names[] = $meta_name; $html_page_meta_descriptions[] = $meta_content; } } //EXAMPLE 1: Extract Title $title_tag = $doc->getElementsByTagName('title'); if ($title_tag->length>0) { echo 'Title: ' .$title = $title_tag[0]->textContent; echo '<br>'; $html_page_titles[] = $title; } //EXAMPLE 2: Extract Title $title_tag = $doc->getElementsByTagName('title'); for ($i = 0; $i < $title_tag->length; $i++) { echo 'Title: ' .$title = $title_tag->item($i)->nodeValue . "

"; $html_page_titles[] = $title; } } } } if(array_count_values($html_page_meta_names)>0) { print_r($html_page_meta_names); echo '<br>'; } if(array_count_values($html_page_meta_descriptions)>0) { print_r($html_page_meta_descriptions); echo '<br>'; } if(array_count_values($html_page_titles)>0) { print_r($html_page_titles); echo '<br>'; } //END OF FUNCTIONS. ?>

