In the code below, I’m scraping a page using DOMDocument and even
though the element I’m looking for is in the source, the getElementByID call
is not returning the node.
Can you see what I’m doing wrong?
<?php
if (isset($_GET["partno"])) {
$partno = $_GET["partno"];
} else {
exit;
}
/************************************************
* Let's navigate to the search page so we can
* get the __VIEWSTATE and __EVENTVALIDATION
* values for this aspx page.
*************************************************/
$ch1 = curl_init();
curl_setopt($ch1, CURLOPT_HEADER, false);
curl_setopt($ch1, CURLOPT_URL, "http://www.dlis.dla.mil/WebFlis/pub/pub_search.aspx");
curl_setopt($ch1, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch1, CURLOPT_RETURNTRANSFER, true);
$returned = curl_exec($ch1);
curl_close($ch1);
/************************************************
* Specify Tidy configuration
*************************************************/
$config = array('indent' => true,'output-xhtml' => true, "wrap" => 240);
/************************************************
* Clean up the HTML returned from the search
* Boy they have some crappy HTML!!!!
*************************************************/
$tidy = new tidy;
$tidy->parseString($returned, $config, 'utf8');
$tidy->cleanRepair();
/************************************************
* Now use DOMDocument to parse out the results
*************************************************/
$doc = new DOMDocument();
$doc->loadHTML($tidy);
$nodes = $doc->getElementsByTagName('input');
/************************************************
* Find the __VIEWSTATE and __EVENTVALIDATION
* values
*************************************************/
for($i=0; $i<$nodes->length; $i++) {
if ($nodes->item($i)->getAttribute('name') == "__VIEWSTATE" ) {
$viewstate = $nodes->item($i)->getAttribute('value');
}
if ($nodes->item($i)->getAttribute('name') == "__EVENTVALIDATION" ) {
$eventvalidation = $nodes->item($i)->getAttribute('value');
}
}
/************************************************
* Now fill in the values and search
*************************************************/
$ch2 = curl_init();
curl_setopt($ch2, CURLOPT_SSL_VERIFYPEER, false);
curl_setopt($ch2, CURLOPT_URL, "http://www.dlis.dla.mil/WebFlis/pub/pub_search.aspx");
curl_setopt($ch2, CURLOPT_HEADER, false);
curl_setopt($ch2, CURLOPT_POST, true);
curl_setopt($ch2, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch2, CURLOPT_POSTFIELDS, array('__VIEWSTATE'=>$viewstate,
'__EVENTVALIDATION'=>$eventvalidation,
'btnNIIN' => 'Go', 'txtPART'=>$partno,
'C1'=>'on','C6'=>'on','C4'=>'on',));
$returned = curl_exec($ch2);
// Simple error checking here
if (!$returned) {
echo curl_error($ch2);
}
curl_close($ch2);
/************************************************
* Clean up the HTML returned from the search
*************************************************/
$tidy->parseString($returned, $config, 'utf8');
$tidy->cleanRepair();
/************************************************
* Reload the DOMDocument to parse out the
* new results
*************************************************/
$doc->loadHTML($tidy);
/************************************************
* When I check the source here, the dgdPart table
* exists in the source but the next line fails
* to return anything.
************************************************/
$dgdParttable = $doc->getElementByID('dgdPart');
if (isset($dgdParttable)) {
$rows = $dgdParttable->item(0)->getElementsByTagName('tr');
foreach ($rows as $row)
{
$cols = $row->getElementsByTagName('td');
$decription = $cols->item(0)->nodeValue;
$qrystr = parse_url($cols->item(1)->nodeValue, PHP_URL_QUERY);
parse_str($qrystr);
$niins_array[] = $niin;
$insert_sql = 'INSERT INTO NIINS (DESCRIPTION, NSN) VALUES ("'.$description.'", "'.$niin.'")';
echo $insert_sql."<br>\
";
// Not ready to insert into database, yet.
//$result = $db->query($insert_sql);
//$result_row = $db->fetchRow($result);
//$niins_id = $result_row['NIINS_ID'];
}
} else {
die("Couldn't get dgdParttable");
}
?>