以前常听到别人做采集程序,今天俺也小试身手。
using System;
using System.Collections;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Web;
using System.Web.SessionState;
using System.Web.UI;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.IO;
using System.Data.SqlClient;
using System.Web.UI.WebControls;
using System.Web.UI.HtmlControls;
namespace Temp
{
/// <summary>
/// Summary description for GetHtmlSourceFromUrl.
/// </summary>
public class GetHtmlSourceFromUrl : System.Web.UI.Page
{
private SqlConnection con;
// Page load handler: runs the scrape-and-store job once, on the initial
// (non-postback) request only.
private void Page_Load(object sender, System.EventArgs e)
{
    // Postbacks must not re-trigger the import.
    if (IsPostBack)
    {
        return;
    }

    InsertToDB();
}
// Downloads the bus-line page for every line number 1..999, cuts out the
// fixed-size header/footer, strips the HTML tags from what remains and
// inserts the resulting text into the BeiJingBus table.
//
// Stops at the first SqlException (its message is written to the response).
// Network errors (e.g. WebException from GetResponse) are not caught here
// and propagate to the caller, as in the original code.
private void InsertToDB()
{
    const string sqlText = "insert into BeiJingBus(BusLineNumber,Html) Values(@BLN,@Html)";
    const string connectionString = @"Server=.xxxx;User ID=xxxx;Pwd=xxxxx;DataBase=Map";

    // Character offsets used to drop the ads around the main content.
    // NOTE(review): these are tied to the exact layout of the scraped page
    // and will silently produce wrong text if the site changes - confirm.
    const int start = 3487;
    const int trailingLength = 4222;

    for (int i = 1; i < 1000; i++)
    {
        // This fetches Beijing; changing the host/parameters targets other cities.
        string url = @"http://beijing.ibusdb.com/?busline="+i+"&s=busline&x=31&y=18";

        string content;
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        // using blocks guarantee the response and reader are released even
        // when ReadToEnd throws.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("gb2312")))
        {
            content = reader.ReadToEnd();
        }

        int end = content.Length - trailingLength;
        if (end - start <= 0)
        {
            // Page is too short to contain any content: bus line i does not exist.
            continue;
        }

        // Remove every HTML tag, keeping only the text between them.
        string html = Regex.Replace(content.Substring(start, end - start), @"<[^>]+>", "");

        // The connection is kept in the con field as before, but is now only
        // created when there is something to insert.
        con = new SqlConnection(connectionString);
        try
        {
            using (SqlCommand cmd = new SqlCommand(sqlText, con))
            {
                cmd.Parameters.Add("@BLN", SqlDbType.Int).Value = i;
                // The Html column is nvarchar(max); send the value as Unicode
                // (size -1 = max) so Chinese text is not pushed through a
                // non-Unicode code page as it would be with SqlDbType.Text.
                cmd.Parameters.Add("@Html", SqlDbType.NVarChar, -1).Value = html;
                con.Open();
                cmd.ExecuteNonQuery();
            }
        }
        catch (SqlException err)
        {
            Response.Write(err.Message);
            break; // finally below still closes the connection
        }
        finally
        {
            con.Close();
        }
    }
}
-- SQL script for the BeiJingBus table that the INSERT statement in InsertToDB writes to:
CREATE TABLE [dbo].[BeiJingBus](
-- Auto-generated row id (identity, seed 1, increment 1).
[id] [int] IDENTITY(1,1) NOT NULL,
-- Bus line number; filled from the @BLN parameter of the INSERT.
[BusLineNumber] [int] NULL,
-- Tag-stripped page text; filled from the @Html parameter of the INSERT.
[Html] [nvarchar](max) COLLATE Chinese_PRC_CI_AS NULL
) ON [PRIMARY]
不会用签名:http://hi.baidu.com/陈立/blog