如何用PHPjava爬虫抓取网页数据完整数据

PHP数据抓取中常用到的代理IP网站 - PHP,爬虫,数据抓取,代理ip - language - ITeye论坛
PHP数据抓取中常用到的代理IP网站
锁定老帖子
精华帖 (0) :: 良好帖 (0) :: 新手帖 (0) :: 隐藏帖 (0)
来自: 北京
发表时间:&&
www.data5u.com
www.goubanjia.com
ip.qiaodm.com
跳转论坛:移动开发技术
Web前端技术
Java企业应用
编程语言技术实例讲解ASP实现抓取网上房产信息_小偷/采集
&%@LANGUAGE="VBSCRIPT" CODEPAGE="936"%& &!-- #include file="conn.ASP" --&
&!-- #include file="inc/function.asp" --& &!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/4/loose.dtd"& &html& &head& &title&Untitled Document&/title& &meta http-equiv="Content-Type" content="text/ charset=gb2312"& &meta http-equiv="refresh" content="300;URL=steal_house.asp"& &/head& &body& &% on error resume next '
Server.ScriptTimeout = 999999 '======================================================== '字符编码函数 '==================================================== Function BytesToBstr(body,code)
dim objstream
set objstream = Server.createObject("adodb.stream")
objstream.Type = 1
objstream.Mode =3
objstream.Open
objstream.Write body
objstream.Position = 0
objstream.Type = 2
objstream.Charset =code BytesToBstr = objstream.ReadText
objstream.Close
set objstream = nothing
End Function
'取行字符串在另一字符串中的出现位置 Function Newstring(wstr,strng)
Newstring=Instr(lcase(wstr),lcase(strng))
if Newstring&=0 then Newstring=Len(wstr)
End Function
'替换字符串函数 function ReplaceStr(ori,str1,str2) ReplaceStr=replace(ori,str1,str2) end function '==================================================== function ReadXML(url,code,start,ends) set oSend=createobject("Microsoft.XMLHTTP") SourceCode = oSend.open ("GET",url,false)
oSend.send() ReadXml=BytesToBstr(oSend.responseBody,code ) start=Instr(ReadXml,start) ReadXml=mid(ReadXml,start) ends=Instr(ReadXml,ends) ReadXml=left(ReadXml,ends-1) end function function SubStr(body,start,ends) start=Instr(body,start) SubStr=mid(body,start+len(start)+1) ends=Instr(SubStr,ends) SubStr=left(SubStr,ends-1) end function dim getcont,NewsContent dim url,title url="http://www.***.com"'新闻网址knowsky.com getcont=ReadXml(url,"gb2312","&table class=k2 border=""0""","&/table&") getcont=RegexHtml(getcont) dim KeyId,NewsClass,City,Position,HouseType,Level,Area,Price,Demostra dim ContactMan,Contact for i=2 to ubound(getcont) response.Write(getcont(i)&"__&br&") tempLink=mid(getcont(i),instr(getcont(i),"href=""")+6,instr(getcont(i),""" onClick")-10) tempLink=replace(tempLink,"../","") response.Write(i&":"&tempLink&"&br&") NewsContent=ReadXml(tempLink,"gb2312","&td valign=""bottom"" width=""400""&","&hr width=""760"" noshade size=""1"" color=""#808080""& ") NewsContent=RemoveHtml(NewsContent) NewsContent=replace(NewsContent,VbCrLf,"") NewsContent=replace(NewsContent,vbNewLine,"") NewsContent=replace(NewsContent," ","") NewsContent=replace(NewsContent," ","") NewsContent=replace(NewsContent," ","")
NewsContent=replace(NewsContent,"\n","")
NewsContent=replace(NewsContent,chr(10),"") NewsContent=replace(NewsContent,chr(13),"") '===============get Content======================= response.Write(NewsContent) KeyId=SubStr(NewsContent,"列号:","信息类别:") NewsClass=SubStr(NewsContent,"类别:","所在城市:") City=SubStr(NewsContent,"城市:","房屋具体位置:") Position=SubStr(NewsContent,"位置:","房屋类型:") HouseType=SubStr(NewsContent,"类型:","楼层:") Level=SubStr(NewsContent,"楼层:","使用面积:") Area=SubStr(NewsContent,"面积:","房价:") Price=SubStr(NewsContent,"房价:","其他说明:") Demostra=SubStr(NewsContent,"说明:","联系人:") ContactMan=SubStr(NewsContent,"联系人:","联系方式:") Contact=SubStr(NewsContent,"联系方式:","信息来源:")
response.Write("总序列号:"&KeyId&"&br&") response.Write("信息类别:"&NewsClass&"&br&") response.Write("所在城市:"&City&"&br&") response.Write("房屋具体位置:"&Position&"&br&") response.Write("房屋类型:"&HouseType&"&br&") response.Write("楼层:"&Level&"&br&") response.Write("使用面积:"&Area&"&br&") response.Write("房价:"&Price&"&br&") response.Write("其他说明:"&Demostra&"&br&") response.Write("联系人:"&ContactMan&"&br&") response.Write("联系方式:"&Contact&"&br&") 'title=RemoveHTML(aa(i)) 'response.Write("title:"&title) for n=0 to application.Contents.count if(application.Contents(n)=KeyId) then ifexit=true
if not ifexit then application(time&i)=KeyId '添加到数据库 '==================================================== set rs=server.createObject("adodb.recordset")
rs.open "select top 1 * from news order by id desc",conn,3,3 rs.addnew rs("NewsClass")=NewsClass rs("City")=City rs("Position")=Position rs("HouseType")=HouseType rs("Level")=Level rs("Area")=Area rs("Price")=Price rs("Demostra")=Demostra rs("ContactMan")=ContactMan rs("Contact")=Contact rs.update rs.close set rs=nothing end if '================================================== next function RemoveTag(body) Set regEx = New RegExp regEx.Pattern = "&[a].*?&\/[a]&" regEx.IgnoreCase = True regEx.Global = True Set Matches = regEx.Execute(body)
dim i,arr(15),ifexit i=0 j=0 For Each Match in Matches TempStr = Match.Value
TempStr=replace(TempStr,"&td&","") TempStr=replace(TempStr,"&/td&","") TempStr=replace(TempStr,"&tr&","") TempStr=replace(TempStr,"&/tr&","")
arr(i)=TempStr
i=i+1 if(i&=15) then exit for end if Next Set regEx=nothing Set Matches =nothing RemoveTag=arr end function function RegexHtml(body) dim r_arr(47),r_temp Set regEx2 = New RegExp regEx2.Pattern ="&a.*?&\/a&" regEx2.IgnoreCase = True regEx2.Global = True Set Matches2 = regEx2.Execute(body)
For Each Match in Matches2 r_arr(iii)=Match.Value iii=iii+1
Next RegexHtml=r_arr set regEx2=nothing set Matches2=nothing end function '====================================================== conn.close set conn=nothing %& &/body& &/html&   function.asp &% '************************************************** '函数名:gotTopic '作 用:截字符串,汉字一个算两个字符,英文算一个字符 '参 数:str ----原字符串 ' strlen ----截取长度 '返回值:截取后的字符串 '************************************************** function gotTopic(str,strlen) if str="" then gotTopic="" exit function end if dim l,t,c, i str=replace(replace(replace(replace(str," "," "),""",chr(34)),"&","&"),"&","&") str=replace(str,"?","") l=len(str) t=0 for i=1 to l c=Abs(Asc(Mid(str,i,1))) if c&255 then t=t+2 else t=t+1 end if if t&=strlen then gotTopic=left(str,i) & "…" exit for else gotTopic=str end if next gotTopic=replace(replace(replace(replace(gotTopic," "," "),chr(34),"""),"&","&"),"&","&") end function '========================================================= '函数:RemoveHTML(strHTML) '功能:去除HTML标记 '参数:strHTML --要去除HTML标记的字符串 '========================================================= Function RemoveHTML(strHTML)
Dim objRegExp, Match, Matches
Set objRegExp = New Regexp
objRegExp.IgnoreCase = True
objRegExp.Global = True
'取闭合的&&
objRegExp.Pattern = "&.+?&"
Set Matches = objRegExp.Execute(strHTML)
' 遍历匹配集合,并替换掉匹配的项目
For Each Match in Matches
strHtml=Replace(strHTML,Match.Value,"")
RemoveHTML=strHTML
Set objRegExp = Nothing
set Matches=nothing End Function
%&   conn.asp &% 'on error resume next set conn=server.createObject("adodb.connection")
con= "driver={Microsoft Access Driver (*.mdb)};dbq=" & Server.MapPath("stest.mdb")
conn.open con sub connclose
conn.close set conn=nothing
end sub %&php中抓取网页内容的实例详解
转载 & & 投稿:lqh
这篇文章主要介绍了php中抓取网页内容的实例详解的相关资料,这里提供两种实现方法,希望能帮助到大家,需要的朋友可以参考下
php中抓取网页内容的实例详解
使用file_get_contents方法实现
$url = "http://news.sina.com.cn/c/nd//doc-ifxwztru6951143.shtml";
$html = file_get_contents($url);
//如果出现中文乱码使用下面代码
//$getcontent = iconv("gb2312", "utf-8",$html);
echo "&textarea style='width:800height:600'&".$html."&/textarea&";
代码很简单,一看就懂,不解释了。
使用curl实现
$url = "http://news.sina.com.cn/c/nd//doc-ifxwztru6951143.shtml";
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
$html = curl_exec($ch);
curl_close($ch);
echo "&textarea style='width:800height:600'&".$html."&/textarea&";
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
加上这句代码,表示如果请求被重定向时,可以访问到最终的请求页面,不然请求的结果会显示如下内容:
&head&&title&Object moved&/title&&/head&
&body&&h1&Object Moved&/h1&This object may be found &a href="some link." rel="external nofollow" &here&/a&.&/body&
如意疑问请留言或者到本站社区交流讨论,感谢阅读,希望能帮助到大家,谢谢大家对本站的支持!
您可能感兴趣的文章:
大家感兴趣的内容
12345678910
最近更新的内容
常用在线小工具通用的PHP实现爬虫网页抓取数据源码下载
1、请不要问“在不在”之类的问题,有问题直接问!2、学生或暂时没有工作的童鞋,99元整站资源免费下载!3、充值VIP会员,整站资源无任何限制下载!5、程序员加油,交流群:
下载资源:
关注微信号获取此Demo视频教程:
下载积分:
通用的PHP实现爬虫网页抓取数据源码下载
友情提示:垃圾评论一律封号,下载出错或任何技术问题请联系QQ
通用的PHP实现爬虫网页抓取数据源码下载
所需积分:20
亲,积分不够,多去发布资源、评论、签到、
或联系QQ人工充值
用户名/邮箱
两周内免登录
注册即送5积分}

我要回帖

更多关于 java爬虫抓取网页数据 的文章

更多推荐

版权声明:文章内容来源于网络,版权归原作者所有,如有侵权请点击这里与我们联系,我们将及时删除。

点击添加站长微信