<?php
/*
 * ASP.NET web site scraping script
 * Developed by MishaInTheCloud.com
 * Copyright 2009 MishaInTheCloud.com. All rights reserved.
 * The use of this script is governed by the CodeProject Open License
 * See the following link for full details on use and restrictions.
 * http://www.codeproject.com/info/cpol10.aspx
 * The above copyright notice must be included in any reproductions of this script.
 */

/*
 * values used throughout the script
 */
// URLs to call - the login page and the secured page
$urlLogin = "http://www.portauthority.org/atisdnn/default.aspx";
$urlSecuredPage = "http://www.portauthority.org/atisdnn/default.aspx";

// POST names and values to support login (left commented out; the form
// fields actually submitted are listed in $valuePair below)
//$nameUsername='txtusername'; // the name of the username textbox on the login form
//$namePassword='txtpassword'; // the name of the password textbox on the login form
//$nameLoginBtn='btnlogin'; // the name of the login button (submit) on the login form
//$valUsername ='myUsername'; // the value to submit for the username
//$valPassword ='myPassword'; // the value to submit for the password
//$valLoginBtn ='Login'; // the text value of the login button itself

// form field name => value pairs sent in the POST body, in addition to the
// ASP.NET hidden fields that are scraped from the page at run time
$valuePair = array(
    'txtDeparture' => '4628 Henry Street',
    'radDeparture' => 'I',
    'txtArrival' => 'Airport Pittsburgh International',
    'radArrival' => 'I',
    'ddlArrDep' => 'D',
    'ddlHours' => '7',
    'ddlMinutes' => '55',
    'ddlAmPm' => 'PM',
    'Calendar1_txtCalendar' => '4/29/2010',
    'ddlTripPreference' => 'I',
    'lblWalkDist' => '.25',
    'ddlNumItins' => '1',
    'hdnTripDate' => '4/29/2010',
    'lblHiddenSearchType' => '',
    'lblHiddenSearch' => '',
    'txtHiddenLocationIndex' => '',
    'txtHiddenArrivalIndex' => '',
    'txtHiddenDepartureIndex' => '',
    'btnSubmit' => 'Submit',
    '__EVENTTARGET' => '',
);

// the path to a file we can read/write; this will
// store cookies we need for accessing secured pages
$cookies = 'someReadableWritableFileLocation\cookie.txt';

// regular expressions to parse out the special ASP.NET
// hidden-field values (__VIEWSTATE and __EVENTVALIDATION).
// The capture group is [^"]* rather than a greedy .* so that it stops at
// the attribute's own closing quote even when other value="..." attributes
// follow on the same line of HTML.
$regexViewstate = '/__VIEWSTATE\" value=\"([^\"]*)\"/i';
$regexEventVal = '/__EVENTVALIDATION\" value=\"([^\"]*)\"/i';
/**
 * utility function: regexExtract
 *
 * Use the given regular expression to extract a value from the given text
 * and return the nthValue capture group as a string.
 *
 * @param string $text     the text to search
 * @param string $regex    a PCRE pattern, including delimiters
 * @param mixed  $regs     scratch variable for the match groups; it is passed
 *                         by value, so the caller's variable is NOT modified
 * @param int    $nthValue index of the group to return (0 = the whole match)
 *
 * @return string the requested group, or "" when the pattern does not match
 *                or the requested group is not present in the match
 */
function regexExtract($text, $regex, $regs, $nthValue)
{
    if (!preg_match($regex, $text, $regs)) {
        return "";
    }

    // preg_match() omits trailing groups that did not participate in the
    // match, so the requested index may be absent even on a successful match
    return isset($regs[$nthValue]) ? $regs[$nthValue] : "";
}
/*
 * initialize a curl handle; we'll use this
 * handle throughout the script
 */
$ch = curl_init();

/*
 * first, issue a GET call to the ASP.NET page.
 * This is necessary to retrieve the __VIEWSTATE
 * and __EVENTVALIDATION values that the server issues
 */
curl_setopt($ch, CURLOPT_URL, $urlLogin);
// have curl_exec() return the response body instead of printing it
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
$data = curl_exec($ch);
if ($data === false) {
    exit('GET ' . $urlLogin . ' failed: ' . curl_error($ch));
}

// from the returned html, parse out the __VIEWSTATE and
// __EVENTVALIDATION values
$regs = array(); // scratch argument required by regexExtract()
$viewstate = regexExtract($data, $regexViewstate, $regs, 1);
$eventval = regexExtract($data, $regexEventVal, $regs, 1);

/*
 * now issue a second call to the same page;
 * this time, it will be a POST; we'll send back
 * as post data the __VIEWSTATE and __EVENTVALIDATION
 * values the server previously sent us, as well as the
 * form fields listed in $valuePair. The cookie jar
 * option is left commented out below.
 */
$postData = '__VIEWSTATE=' . rawurlencode($viewstate)
    . '&__EVENTVALIDATION=' . rawurlencode($eventval);
foreach ($valuePair as $key => $value) {
    // keys and values contain spaces and slashes, so both must be encoded
    $postData .= '&' . rawurlencode($key) . '=' . rawurlencode($value);
}

curl_setopt($ch, CURLOPT_POST, TRUE);
curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
curl_setopt($ch, CURLOPT_URL, $urlLogin);
//curl_setopt($ch, CURLOPT_COOKIEJAR, $cookies);
$data = curl_exec($ch);
if ($data === false) {
    exit('POST ' . $urlLogin . ' failed: ' . curl_error($ch));
}

/*
 * with the authentication cookie in the jar,
 * we'll now issue a GET to the secured page;
 * we set curl's COOKIEFILE option to the same
 * file we used for the jar before to ensure the
 * authentication cookie is sent back to the
 * server
 * (this step is commented out in this script)
 */
//curl_setopt($ch, CURLOPT_POST, FALSE);
//curl_setopt($ch, CURLOPT_URL, $urlSecuredPage);
//curl_setopt($ch, CURLOPT_COOKIEFILE, $cookies);
//$data = curl_exec($ch);

// at this point the returned page may be parsed for
// values, or additional POSTS made to submit parameters
// and retrieve data. For this sample, we'll just
// echo the results.
echo $data;

/*
 * that's it! Close the curl handle
 */
curl_close($ch);
// --- blog page text captured along with the script (not part of the program) ---
// Thursday, April 29, 2010
// Subscribe to:
// Post Comments (Atom)
// No comments:
// Post a Comment