| 
 | 
	
 
 
用PHP抓取网页  
 
 |   |  作者: 共创联盟  
加入时间: 2003-11-24  
浏览次数: 322  
 
  抓取网页,并将文字和图片存入数据库中。利用getimg.php?id=读取数据库中的图片, 
利用getarticle.php?id=读取文档。  
  <? 
 
/**建表文档 articletype对应的类型 1:oracle,2:java,3:system 
CREATE TABLE article ( 
 id int(6) NOT NULL auto_increment, 
 title varchar(80) default NULL, 
 content text, 
 url varchar(80) default NULL, 
 joindate varchar(12) default NULL, 
 articletype int(2) not null, 
 PRIMARY KEY (id) 
) ; 
CREATE TABLE images ( 
 id int(4) NOT NULL auto_increment, 
 bin_data longblob, 
 filetype varchar(50) default NULL, 
 title varchar(50) default NULL, 
 articleid int(6) NOT NULL, 
 PRIMARY KEY (id) 
) ENGINE=MyISAM; 
*/ 
 
/**
 * Fetches a web page and stores it in MySQL.
 *
 * The page HTML goes into the `article` table. Every image referenced by an
 * <img> tag is downloaded into the `images` table and its src attribute is
 * rewritten to "$getimg<image id>". Relative <a href> targets are rewritten
 * to absolute URLs based on $url.
 *
 * Typical use:
 *   $saver = new SaveWeb($title, $url, $typeid);
 *   $articleId = $saver->webSave();
 */
class SaveWeb
{
    /** Article title; also used to tag freshly downloaded images. */
    public $title;
    /** URL of the page; base for resolving relative links and images. */
    public $url;
    /** Value stored in article.articletype. */
    public $typeid;
    /** HTML supplied through setContent(). */
    public $content;
    /** When true, webSave() downloads $url; setContent() switches it off. */
    public $getUrl = true;
    /** Prefix that replaces image URLs; the image id is appended. */
    public $getimg = "getimg.php?id=";
    // NOTE(review): credentials are hard-coded; values kept unchanged so
    // existing deployments keep working, but they belong in configuration.
    public $dbuser = "root";
    public $dbpassword = "whf76128";
    public $dbname = "tech";
    public $dbhost = "127.0.0.1";

    /** Absolute image URL => replacement URL (or false), per parserHtml() run. */
    private $imgCache = array();

    /**
     * @param string $title  article title
     * @param string $url    source URL of the page
     * @param int    $typeid article type id
     */
    public function __construct($title, $url, $typeid)
    {
        $this->title = $title;
        $this->url = $url;
        $this->typeid = $typeid;
    }

    /**
     * Old-style initializer, kept so code that calls it explicitly (or runs
     * on an engine that still treats it as the constructor) keeps working.
     */
    public function SaveWeb($title, $url, $typeid)
    {
        $this->title = $title;
        $this->url = $url;
        $this->typeid = $typeid;
    }

    /**
     * Supplies the HTML directly; webSave() will then not download $url.
     *
     * @param string $html
     */
    public function setContent($html)
    {
        $this->content = $html;
        $this->getUrl = false;
    }

    /**
     * Stores $content as plain paragraph text (newlines become <br>),
     * without any tag rewriting.
     *
     * @return int|string|false id of the new article row, false on failure
     */
    public function saveContent()
    {
        return $this->saveHtml(nl2br((string) $this->content));
    }

    /**
     * Stores the page: fetch (or use setContent() data), rewrite tags,
     * insert the article, attach its images, remove orphaned images.
     *
     * @return int|string|false id of the new article row; false when title
     *                          or url is empty, the page cannot be read, or
     *                          the insert fails
     */
    public function webSave()
    {
        if ((string) $this->title === '' || (string) $this->url === '') {
            return false;
        }

        if ($this->getUrl) {
            $text = $this->getHtml($this->url);
            if ($text === false) {
                // Do not store an empty article when the download failed.
                return false;
            }
        } else {
            $text = $this->content;
        }

        $id = $this->saveHtml($this->parserHtml($text));
        if ($id) {
            $this->updateImgPID($id, $this->title);
        }
        // Images still at articleid = 0 are left-overs; drop them.
        $this->delImg();
        return $id;
    }

    /**
     * Case-insensitive search for $strchild in $strobj, starting at $int.
     *
     * @param string $strobj   text to search in
     * @param string $strchild text to search for
     * @param int    $int      start offset (negative values are treated as 0)
     * @return int|false position of the first match, or false when there is
     *                   none. Position 0 is a valid match, so callers must
     *                   compare the result with === false.
     */
    public function strfind($strobj, $strchild, $int)
    {
        $strobj = (string) $strobj;
        $strchild = (string) $strchild;
        $int = (int) $int;
        if ($int < 0) {
            $int = 0;
        }
        // stripos() rejects an empty needle or an offset beyond the string
        // on some PHP versions, so answer those cases here.
        if ($strchild === '' || $int >= strlen($strobj)) {
            return false;
        }
        return stripos($strobj, $strchild, $int);
    }

    /**
     * Reads the whole resource at $url.
     *
     * @param string $url
     * @return string|false the data, or false (after printing a message)
     *                      when the resource cannot be opened
     */
    public function getHtml($url)
    {
        // Binary mode: this method is also used to download images.
        $fp = fopen($url, "rb");
        if ($fp === false) {
            // The URL may come from scraped markup, so escape it for output.
            echo "<font color=red>读取失败,文件位置:" . htmlspecialchars($url, ENT_QUOTES) . "</font><br>";
            return false;
        }

        $data = "";
        while (!feof($fp)) {
            $chunk = fread($fp, 8192);
            if ($chunk === false) {
                // A read error would otherwise spin forever.
                break;
            }
            $data .= $chunk;
        }
        fclose($fp);
        return $data;
    }

    /**
     * Deletes every image row that is not attached to an article.
     */
    public function delImg()
    {
        $this->runStatement("delete from images where articleid = 0");
    }

    /**
     * Attaches the not-yet-assigned images tagged with $title to article $id.
     *
     * Only rows with articleid = 0 are touched, so images of an older
     * article that happens to share the title are left alone.
     *
     * @param int    $id    article id
     * @param string $title title the images were stored under
     */
    public function updateImgPID($id, $title)
    {
        $this->runStatement(
            "update images set articleid = ? where title = ? and articleid = 0",
            'is',
            array($id, $title)
        );
    }

    /**
     * Inserts an article row holding $data.
     *
     * @param string $data HTML to store
     * @return int|string|false id of the new row, false on failure
     */
    public function saveHtml($data)
    {
        return $this->runStatement(
            "INSERT INTO article (title,content,url,joindate,articletype) VALUES (?,?,?,?,?)",
            'ssssi',
            array($this->title, (string) $data, $this->url, gmdate("Y-m-d"), $this->typeid)
        );
    }

    /**
     * Downloads $url and stores it as an unattached image (articleid = 0).
     *
     * @param string $url absolute image URL
     * @return int|string|false id of the new image row; false when the
     *                          download or the insert fails
     */
    public function saveImg($url)
    {
        $data = $this->getHtml($url);
        if ($data === false) {
            return false;
        }
        return $this->runStatement(
            "INSERT INTO images (bin_data,filetype,title,articleid) VALUES (?,?,?,0)",
            'sss',
            array($data, $this->getContentType($url), $this->title)
        );
    }

    /**
     * @param string $inFileName path or URL
     * @return string the trailing name component
     */
    public function getContentName($inFileName)
    {
        return basename($inFileName);
    }

    /**
     * Guesses a MIME type from the file extension of a path or URL.
     *
     * The extension is matched case-insensitively and any query string or
     * fragment is ignored.
     *
     * @param string $inFileName path or URL
     * @return string MIME type; "application/octet-stream" when unknown
     */
    public function getContentType($inFileName)
    {
        $inFileName = (string) $inFileName;
        // Drop "?query" / "#fragment", then the directory part.
        $name = basename(substr($inFileName, 0, strcspn($inFileName, '?#')));

        $extension = strrchr($name, ".");
        if ($extension === false) {
            return "application/octet-stream";
        }

        $types = array(
            ".gif"  => "image/gif",
            ".gz"   => "application/x-gzip",
            ".htm"  => "text/html",
            ".html" => "text/html",
            ".jpg"  => "image/jpeg",
            ".jpeg" => "image/jpeg",
            ".tar"  => "application/x-tar",
            ".txt"  => "text/plain",
            ".zip"  => "application/zip",
            ".png"  => "image/png",
            ".bmp"  => "image/bmp",
        );
        $extension = strtolower($extension);
        return isset($types[$extension]) ? $types[$extension] : "application/octet-stream";
    }

    /**
     * Rewrites the <img> and <a> tags of $text.
     *
     * Each http(s) image is downloaded via saveImg() and its src replaced by
     * "$getimg<id>"; relative href values are made absolute. Only the
     * attribute value inside its own tag is changed; the rest of the
     * document is left untouched.
     *
     * @param string $text HTML
     * @return string rewritten HTML
     */
    public function parserHtml($text)
    {
        $text = (string) $text;
        $this->imgCache = array();

        $rewritten = preg_replace_callback('/<img\s[^>]*>/i', array($this, 'rewriteImgTag'), $text);
        if ($rewritten !== null) {
            $text = $rewritten;
        }

        // "\s" after the tag name keeps <abbr>, <area>, <address> ... out.
        $rewritten = preg_replace_callback('/<a\s[^>]*>/i', array($this, 'rewriteAnchorTag'), $text);
        if ($rewritten !== null) {
            $text = $rewritten;
        }

        return $text;
    }

    /** preg callback: store the image of one <img> tag and point src at it. */
    private function rewriteImgTag($match)
    {
        $tag = $match[0];
        $attr = $this->findAttribute($tag, 'src');
        if ($attr === false) {
            return $tag;
        }
        $url = trim($attr[0]);
        if ($url === '') {
            return $tag;
        }

        // Already points at the image script: nothing to do.
        $script = parse_url($this->getimg);
        if (is_array($script) && isset($script['path']) && $script['path'] !== ''
            && stripos($url, $script['path']) !== false) {
            return $tag;
        }

        $absolute = $this->resolveUrl($url);
        // Only fetch over http(s); never file:, data:, ftp: ... taken from
        // untrusted markup.
        if (!preg_match('/^https?:\/\//i', $absolute)) {
            return $tag;
        }

        // Download each distinct image once per document.
        if (!array_key_exists($absolute, $this->imgCache)) {
            $id = $this->saveImg($absolute);
            $this->imgCache[$absolute] = $id ? $this->getimg . $id : false;
        }
        if ($this->imgCache[$absolute] === false) {
            return $tag;
        }

        return substr_replace($tag, $this->imgCache[$absolute], $attr[1], strlen($attr[0]));
    }

    /** preg callback: make the href of one <a> tag absolute. */
    private function rewriteAnchorTag($match)
    {
        $tag = $match[0];
        $attr = $this->findAttribute($tag, 'href');
        if ($attr === false) {
            return $tag;
        }
        $url = trim($attr[0]);
        // Empty links and in-page anchors stay as they are.
        if ($url === '' || $url[0] === '#') {
            return $tag;
        }

        $absolute = $this->resolveUrl($url);
        if ($absolute === $url) {
            return $tag;
        }
        return substr_replace($tag, $absolute, $attr[1], strlen($attr[0]));
    }

    /**
     * Finds attribute $name in the markup of a single tag.
     *
     * @return array|false array(value, byte offset of the value in $tag),
     *                     or false when the attribute is absent
     */
    private function findAttribute($tag, $name)
    {
        // name = "value" | 'value' | bare-value; quoted values are consumed
        // whole, so "src=" inside another attribute's value is not matched.
        $pattern = '/([^\s"\'<>\/=]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/';
        if (!preg_match_all($pattern, $tag, $sets, PREG_SET_ORDER | PREG_OFFSET_CAPTURE)) {
            return false;
        }
        foreach ($sets as $set) {
            if (strcasecmp($set[1][0], $name) !== 0) {
                continue;
            }
            // Exactly one of the groups 2..4 took part in the match;
            // the others are absent or carry offset -1.
            for ($group = 2; $group <= 4; $group++) {
                if (isset($set[$group]) && $set[$group][1] >= 0) {
                    return array($set[$group][0], $set[$group][1]);
                }
            }
        }
        return false;
    }

    /**
     * Resolves $url against $this->url.
     *
     * URLs that already carry a scheme are returned unchanged, as is
     * everything when $this->url has no host to resolve against.
     */
    private function resolveUrl($url)
    {
        if (preg_match('/^[a-z][a-z0-9+.\-]*:/i', $url)) {
            return $url;
        }

        $base = parse_url($this->url);
        if (!is_array($base) || !isset($base['host'])) {
            return $url;
        }
        $scheme = isset($base['scheme']) ? $base['scheme'] : 'http';
        $root = $scheme . '://' . $base['host'];
        if (isset($base['port'])) {
            $root .= ':' . $base['port'];
        }
        $path = (isset($base['path']) && $base['path'] !== '') ? $base['path'] : '/';

        if (substr($url, 0, 2) === '//') {
            // Protocol-relative URL.
            return $scheme . ':' . $url;
        }
        $first = substr($url, 0, 1);
        if ($first === '/') {
            return $root . $url;
        }
        if ($first === '?') {
            return $root . $path . $url;
        }

        // Directory of the base document, including the trailing slash.
        $slash = strrpos($path, '/');
        $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
        return $root . $dir . $url;
    }

    /**
     * Opens a connection, runs one prepared statement and closes again.
     *
     * @param string $sql    statement with ? placeholders
     * @param string $types  mysqli type string, one letter per parameter
     * @param array  $params values to bind
     * @return int|string|false insert id reported by the connection (0 for
     *                          statements that insert nothing), false on
     *                          any failure
     */
    private function runStatement($sql, $types = '', $params = array())
    {
        try {
            $db = mysqli_connect($this->dbhost, $this->dbuser, $this->dbpassword, $this->dbname);
            if (!$db) {
                return false;
            }
            $stmt = mysqli_prepare($db, $sql);
            if (!$stmt) {
                mysqli_close($db);
                return false;
            }
            if ($types !== '') {
                // bind_param takes its values by reference.
                $args = array($stmt, $types);
                foreach (array_keys($params) as $key) {
                    $args[] = &$params[$key];
                }
                call_user_func_array('mysqli_stmt_bind_param', $args);
            }
            $ok = mysqli_stmt_execute($stmt);
            $id = mysqli_insert_id($db);
            mysqli_stmt_close($stmt);
            mysqli_close($db);
            return $ok ? $id : false;
        } catch (Exception $e) {
            // Newer PHP versions report mysqli errors as exceptions; surface
            // them as a warning and fail the same way as a false return.
            trigger_error('SaveWeb database error: ' . $e->getMessage(), E_USER_WARNING);
            return false;
        }
    }
}
 
 
?> 
 
 |  
  |   
 
 
 
 |