| 
<?php
 /**
 * About the author:
 *   vivekanandan
 *   email: [email protected]
 *
 * If you need any help with this spider, or with anything else in PHP, just mail me.
 *
 * About the class:
 *   WebSpider                  - the constructor sets the domain, the maximum depth and the URL to map
 *
 *   processTagInPageData()     - processes the anchor tags and frame tags of a page
 *   fetchURLPageData()         - returns the HTML page content for the given URL
 *   isURLExists()              - checks whether the given URL has already been added to the DB
 *   displayDomainRecords()     - displays the records from the DB
 *   StoreUniqueURL()           - stores the URL in the DB if it is not already there
 *   processSpecificTagbyType() - parses every occurrence of a tag and truncates the parsed part from the string
 */
 
 
 
 // Switch on-screen error reporting on for every script that includes this file.
 // NOTE(review): this is a load-time side effect of a class file - confirm it is wanted outside development.
 ini_set('display_errors', '1');
 
 
 /* table structure
 
 CREATE TABLE `spider` (
 `id` bigint(20) NOT NULL auto_increment,
 `domain` varchar(150) NOT NULL,
 `url` varchar(2000) NOT NULL,
 `parentid` int(11) NOT NULL,
 `visitflag` int(11) NOT NULL,
 `type` varchar(20) NOT NULL,
 `level` mediumint(9) NOT NULL,
 PRIMARY KEY  (`id`)
 )
 */
 
 
 /**
  * Single-page link spider backed by a MySQL table named `spider`.
  *
  * The class downloads one page with cURL, pulls the link targets out of its
  * "<a ... href=" and "<frame ... src=" tags with plain substring searches and
  * stores every URL that is not yet recorded for the domain.
  *
  * The database settings are public properties that the caller assigns after
  * construction; the connection is opened lazily on the first query.
  */
 class WebSpider {

     /** Maximum depth passed to the constructor; it is not read anywhere inside this class. */
     public $mMaxDepth;
     /** Domain the collected URLs are stored under; also sent as the HTTP referer. */
     public $mDomain;
     /** MySQL host name. */
     public $mDBHost;
     /** MySQL user name. */
     public $mDBUserName;
     /** MySQL password. */
     public $mDBPassword;
     /** MySQL database (schema) name. */
     public $mDBDatabase;
     /** Declared for callers; never assigned or read inside this class. */
     public $mURLPageData;
     /** Start URL that is downloaded and parsed. */
     public $mURL;

     /** Open mysqli link, or null while no connection has been made. */
     private $mDBLink = null;
     /** Connection settings the open link was created with. */
     private $mDBLinkSettings = null;

     /**
      * @param string $pmDomain domain the crawl belongs to
      * @param int    $pmDepth  maximum crawl depth
      * @param string $pmURL    URL of the page to download and parse
      */
     public function __construct($pmDomain, $pmDepth, $pmURL)
     {
         $this->mDomain   = $pmDomain;
         $this->mMaxDepth = $pmDepth;
         $this->mURL      = $pmURL;
     }

     /**
      * Old class-named constructor. PHP 8 no longer treats it as a constructor,
      * so it only remains as a plain method for code that calls it explicitly.
      */
     public function WebSpider($pmDomain, $pmDepth, $pmURL)
     {
         $this->__construct($pmDomain, $pmDepth, $pmURL);
     }

     /**
      * Returns the mysqli link, connecting on first use.
      *
      * The link is reopened when any of the four public DB properties changed
      * since it was created, so late changes to the settings are still honoured.
      *
      * @return mysqli
      * @throws RuntimeException when the connection cannot be established
      */
     private function getConnection()
     {
         $aSettings = array($this->mDBHost, $this->mDBUserName, $this->mDBPassword, $this->mDBDatabase);

         if ($this->mDBLink !== null && $this->mDBLinkSettings === $aSettings) {
             return $this->mDBLink;
         }

         if ($this->mDBLink !== null) {
             mysqli_close($this->mDBLink);
             $this->mDBLink = null;
         }

         $rLink = mysqli_connect($this->mDBHost, $this->mDBUserName, $this->mDBPassword, $this->mDBDatabase);
         if (!$rLink) {
             throw new RuntimeException('WebSpider: database connection failed: ' . mysqli_connect_error());
         }

         $this->mDBLink         = $rLink;
         $this->mDBLinkSettings = $aSettings;

         return $rLink;
     }

     /**
      * Prepares a statement on the shared connection.
      *
      * @param string $pmSQL SQL text with "?" placeholders
      * @return mysqli_stmt
      * @throws RuntimeException when the statement cannot be prepared
      */
     private function prepareStatement($pmSQL)
     {
         $rLink = $this->getConnection();
         $rStmt = mysqli_prepare($rLink, $pmSQL);
         if (!$rStmt) {
             throw new RuntimeException('WebSpider: could not prepare statement: ' . mysqli_error($rLink));
         }

         return $rStmt;
     }

     /**
      * Executes a prepared statement; on failure the statement is closed before throwing.
      *
      * @param mysqli_stmt $pmStatement statement with its parameters already bound
      * @throws RuntimeException when execution fails
      */
     private function executeStatement($pmStatement)
     {
         if (!mysqli_stmt_execute($pmStatement)) {
             $vError = mysqli_stmt_error($pmStatement);
             mysqli_stmt_close($pmStatement);
             throw new RuntimeException('WebSpider: query failed: ' . $vError);
         }
     }

     /**
      * Counts the rows stored for the given domain/URL pair.
      *
      * Both values are bound as statement parameters, so pass them raw (unescaped).
      *
      * @param string $pmDomain
      * @param string $pmURL
      * @return int number of matching rows (0 when the URL is not stored yet)
      * @throws RuntimeException on database errors
      */
     public function isURLExists($pmDomain, $pmURL)
     {
         $rStmt = $this->prepareStatement('SELECT COUNT(`id`) FROM `spider` WHERE `domain` = ? AND `url` = ?');
         mysqli_stmt_bind_param($rStmt, 'ss', $pmDomain, $pmURL);
         $this->executeStatement($rStmt);

         $vCount = 0;
         mysqli_stmt_bind_result($rStmt, $vCount);
         mysqli_stmt_fetch($rStmt);
         mysqli_stmt_close($rStmt);

         return (int) $vCount;
     }

     /**
      * Prints the domain, the number of stored URLs and an HTML table of them.
      *
      * Every value is HTML-escaped before output because the stored URLs come
      * from crawled pages and cannot be trusted.
      *
      * @param string $pmDomain
      * @throws RuntimeException on database errors
      */
     public function displayDomainRecords($pmDomain)
     {
         $rStmt = $this->prepareStatement('SELECT `url`, `type` FROM `spider` WHERE `domain` = ? ORDER BY `id` ASC');
         mysqli_stmt_bind_param($rStmt, 's', $pmDomain);
         $this->executeStatement($rStmt);

         $vURL  = null;
         $vType = null;
         mysqli_stmt_bind_result($rStmt, $vURL, $vType);

         // Collect the rows first so that the total can be printed above the table.
         $aRows = array();
         while (mysqli_stmt_fetch($rStmt)) {
             $aRows[] = array('url' => $vURL, 'type' => $vType);
         }
         mysqli_stmt_close($rStmt);

         print "<strong>Domain</strong> : " . htmlspecialchars((string) $pmDomain, ENT_QUOTES)
             . " <strong>Total URL</strong> :" . count($aRows);
         print "\n<table width='80%'  cellspacing='2' cellpadding='2' border=\"1\">\n";
         print "<tr>\n<td><strong>URL</strong></td>\n<td><strong>Type</strong></td>\n</tr>\n";
         foreach ($aRows as $aRec) {
             print "<tr>\n<td>" . htmlspecialchars((string) $aRec['url'], ENT_QUOTES) . "</td>\n"
                 . "<td>" . htmlspecialchars((string) $aRec['type'], ENT_QUOTES) . "</td>\n</tr>\n";
         }
         print "</table>\n";
     }

     /**
      * Inserts the URL for the domain unless an identical row already exists.
      *
      * New rows get visitflag 0. The trailing parameters now have defaults
      * because a default in the middle of required parameters is deprecated.
      *
      * @param string $pmDomain
      * @param string $pmURL      raw URL (it is bound as a parameter, not escaped)
      * @param int    $pmParentId id of the row the URL was found on
      * @param int    $pmLevel    depth level stored with the row
      * @param string $pmType     free-form type label stored with the row
      * @throws RuntimeException on database errors
      */
     public function StoreUniqueURL($pmDomain, $pmURL, $pmParentId = 0, $pmLevel = 1, $pmType = '')
     {
         if ($this->isURLExists($pmDomain, $pmURL) != 0) {
             return;
         }

         $rStmt = $this->prepareStatement(
             'INSERT INTO `spider` (`domain`, `url`, `parentid`, `visitflag`, `type`, `level`)'
             . ' VALUES (?, ?, ?, 0, ?, ?)'
         );
         mysqli_stmt_bind_param($rStmt, 'ssisi', $pmDomain, $pmURL, $pmParentId, $pmType, $pmLevel);
         $this->executeStatement($rStmt);
         mysqli_stmt_close($rStmt);
     }

     /**
      * Finds the next occurrence of a tag and extracts the value of an attribute after it.
      *
      * Matching is a plain, case-sensitive substring search: "<a" also matches
      * "<abbr", and the attribute is looked up anywhere after the tag name, not
      * only inside that tag.
      *
      * The value may be double-quoted, single-quoted or unquoted. A quoted value
      * whose closing quote is missing yields an empty URL.
      *
      * @param string $pmData         HTML to search
      * @param string $pmTagName      tag prefix, e.g. "<a"
      * @param string $pmAtributeName attribute name including "=", e.g. "href="
      * @return array|false array("url" => value, "str" => data starting at the
      *                     attribute name), or false when nothing more can be found
      */
     public function fetchLinkfromTag($pmData, $pmTagName, $pmAtributeName)
     {
         $pmData         = (string) $pmData;
         $pmTagName      = (string) $pmTagName;
         $pmAtributeName = (string) $pmAtributeName;

         // An empty needle could never advance the search, so refuse it up front.
         if ($pmData === '' || $pmTagName === '' || $pmAtributeName === '') {
             return false;
         }

         $vTagPos = strpos($pmData, $pmTagName);
         if ($vTagPos === false) {
             return false; // no further tag: stop the search
         }

         // Skip the tag name itself so that every call consumes part of the input.
         $vAfterTag = (string) substr($pmData, $vTagPos + strlen($pmTagName));

         $vAttrPos = strpos($vAfterTag, $pmAtributeName);
         if ($vAttrPos === false) {
             return false; // the attribute does not occur in the rest of the data at all
         }

         $vRemainder = (string) substr($vAfterTag, $vAttrPos);
         $vValue     = (string) substr($vRemainder, strlen($pmAtributeName));
         $vFirstChar = (string) substr($vValue, 0, 1);

         if ($vFirstChar === '"' || $vFirstChar === "'") {
             $vClosePos = strpos($vValue, $vFirstChar, 1);
             $vURL = ($vClosePos === false) ? '' : (string) substr($vValue, 1, $vClosePos - 1);
         } else {
             // Unquoted value: it ends at the first whitespace character or ">".
             $vURL = (string) substr($vValue, 0, strcspn($vValue, " \t\r\n>"));
         }

         return array("url" => $vURL, "str" => $vRemainder);
     }

     /**
      * Downloads a page with cURL.
      *
      * @param string $vURL
      * @return string|false response body, or false when the transfer fails
      */
     public function fetchURLPageData($vURL)
     {
         $rCurlRes = curl_init();
         if ($rCurlRes === false) {
             return false;
         }

         curl_setopt($rCurlRes, CURLOPT_URL, $vURL);
         curl_setopt($rCurlRes, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.13)');
         curl_setopt($rCurlRes, CURLOPT_REFERER, $this->mDomain);
         curl_setopt($rCurlRes, CURLOPT_AUTOREFERER, true);
         curl_setopt($rCurlRes, CURLOPT_HEADER, 0);         // keep the response headers out of the result
         curl_setopt($rCurlRes, CURLOPT_RETURNTRANSFER, 1); // return the body instead of printing it

         return curl_exec($rCurlRes);
     }

     /**
      * Records the start URL for the domain (parent id 0, level 1, type "index").
      *
      * @throws RuntimeException on database errors
      */
     public function ProcessSpiderInit()
     {
         $this->StoreUniqueURL($this->mDomain, $this->mURL, 0, 1, 'index');
     }

     /**
      * Stores every link found for one tag/attribute pair.
      *
      * Each round continues with the remainder returned by fetchLinkfromTag(),
      * which is always shorter than its input, so the loop terminates.
      * Empty URLs are skipped. Rows are stored with parent id 1, level 1 and the
      * tag name as their type.
      *
      * @param string $pmData      HTML to scan
      * @param string $pmTagName   tag prefix, e.g. "<a"
      * @param string $pmAttribute attribute name including "=", e.g. "href="
      * @throws RuntimeException on database errors
      */
     public function processSpecificTagbyType($pmData, $pmTagName, $pmAttribute)
     {
         $aResult = $this->fetchLinkfromTag($pmData, $pmTagName, $pmAttribute);

         while ($aResult !== false) {
             if ($aResult['url'] !== '') {
                 $this->StoreUniqueURL($this->mDomain, $aResult['url'], 1, 1, $pmTagName);
             }
             $aResult = $this->fetchLinkfromTag($aResult['str'], $pmTagName, $pmAttribute);
         }
     }

     /**
      * Stores the targets of all anchor and frame tags found in the page.
      *
      * @param string $pmData HTML of the page
      * @throws RuntimeException on database errors
      */
     public function processTagInPageData($pmData)
     {
         $this->processSpecificTagbyType($pmData, '<a', "href=");
         $this->processSpecificTagbyType($pmData, '<frame', "src=");
     }

     /**
      * Downloads the start URL and stores the links found in it.
      * Nothing is parsed when the download fails or returns an empty body.
      *
      * @throws RuntimeException on database errors
      */
     public function fetchURLDataandParseURL()
     {
         $vData = $this->fetchURLPageData($this->mURL);
         if (!is_string($vData) || $vData === '') {
             return;
         }

         $this->processTagInPageData($vData);
     }

 }
 
 ?>
 |