asp.net(c#)做一个网页数据采集工具

更新时间：2009年12月16日 22:56:40 作者：

最近做一个网站，该网站需要添加4000多条产品信息，如果用人工方法去别的网站copy，那至少要花费半个月时间才能完成，所以我想了个办法，使用c#做出了一个网页数据采集软件。

通过这个软件，一两天就完成了几千条产品数据的录入。可见很多工作不必一味靠人工去做；作为一个程序员，就是要把那些经常做重复性、繁琐工作的人解放出来。下面只是写了一些核心代码，而且采集逻辑必须和对应网站的页面结构相挂钩。作者：郑少群

 
/// <summary>
/// Downloads the page whose address is entered in textBox1, extracts every
/// href attribute found in it and writes the resulting absolute URLs to
/// textBox5, one per line.
/// </summary>
/// <param name="sender">The button that raised the event.</param>
/// <param name="e">Unused event data.</param>
private void button1_Click(object sender, EventArgs e)
{
    string pageUrl = textBox1.Text.Trim();
    string domain = textBox2.Text.Trim();

    if (pageUrl == "" || domain == "")
    {
        MessageBox.Show("网址和域名不能为空！", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        return;
    }

    try
    {
        // Fetch the address the user entered (and that was just validated)
        // rather than a fixed, hard-coded site.
        string html = inc.GetHtml(pageUrl);

        // Extract the href attributes.
        ArrayList links = inc.GetMatchesStr(html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");

        StringBuilder sb = new StringBuilder();
        foreach (object match in links)
        {
            string link = match.ToString().Replace("\"", "").Replace("'", "");

            // Every match starts with the attribute name; strip it including
            // any whitespace around '=' (the extraction pattern allows it).
            link = Regex.Replace(link, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

            // Site-relative link: prefix it with the domain from textBox2.
            if (link.StartsWith("/", StringComparison.Ordinal))
                link = domain + link;

            // Add a scheme only when none is present, so that https links
            // are not turned into "http://https://...".
            bool hasScheme = link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                          || link.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
            if (!hasScheme)
                link = "http://" + link;

            sb.Append(link).Append("\r\n");
        }

        // Output the extracted URLs to the text box, one link per line.
        textBox5.Text = sb.ToString();

        MessageBox.Show("共提取" + links.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
    catch (Exception err)
    {
        MessageBox.Show("提取出错！原因：" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
    }
}




/// <summary>
/// Background job: for every URL listed in textBox5 (one per line) downloads
/// the product page, cuts the product name, model id and description out of
/// the HTML, downloads the product image into the local "Images" folder and
/// finally saves all collected rows to the Tb_Product table.
/// Failures for a single URL are collected and reported at the end; they do
/// not stop the run.
/// </summary>
/// <param name="sender">The BackgroundWorker running this handler.</param>
/// <param name="e">Unused event data.</param>
private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
    BackgroundWorker worker = (BackgroundWorker)sender; // used for progress reporting

    // NOTE(review): this handler reads and writes UI members (textBox5,
    // textBox2, toolStripStatusLabel1, and presumably DataBind/ShowError)
    // directly, as the original did. Confirm whether these should be moved
    // to the ProgressChanged / RunWorkerCompleted handlers.

    // Split on line breaks only and keep the original casing: URLs may
    // legitimately contain commas and case-sensitive path segments.
    string[] Urls = textBox5.Text.Split(new string[] { "\r\n", "\n" }, StringSplitOptions.RemoveEmptyEntries);
    string domain = textBox2.Text.Trim();
    string ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images\\";
    StringBuilder ErrorStr = new StringBuilder();

    // Refill the product table from scratch.
    Database.ExecuteNonQuery("delete from Tb_Product");
    DataTable dt2 = new DataTable();

    using (OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings))
    using (OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn))
    // The command builder looks unused, but it generates the INSERT command
    // that da.Update relies on.
    using (OleDbCommandBuilder cb = new OleDbCommandBuilder(da))
    {
        da.Fill(dt2);
        dt2.Rows.Clear();

        // Process the URLs one by one.
        for (int i = 0; i < Urls.Length; i++)
        {
            // Stop early on cancellation; rows collected so far are still saved below.
            if (worker.CancellationPending)
                break;

            string url = Urls[i].Trim();
            if (url == "")
                continue; // skip blank lines instead of aborting the whole run

            try
            {
                string html = inc.GetHtml(url);
                DataRow NewRow = dt2.NewRow();

                // Product name: the content of the <title> element.
                string ProductName = html.Substring(IndexOfRequired(html, "<title>") + 7);
                ProductName = ProductName.Substring(0, IndexOfRequired(ProductName, "</title>")).Trim();
                NewRow["ProductName"] = ProductName;

                // Model id: whatever follows "Model:" inside the product name.
                NewRow["ModelId"] = ProductName.Substring(IndexOfRequired(ProductName, "Model:") + 6).Trim();

                // Description: these offsets are specific to the scraped site's
                // markup and must be adapted for every other site.
                string Introduce = html.Substring(IndexOfRequired(html, "Product Details") + 26);
                Introduce = Introduce.Substring(0, IndexOfRequired(Introduce, "</table>") + 8).Trim();
                NewRow["Introduce"] = Introduce;

                // Image: the src attribute of the first centered <img>.
                string imageTag = html.Substring(IndexOfRequired(html, "align=center><img") + 17);
                string src = imageTag.Substring(IndexOfRequired(imageTag, "src=\"") + 5);
                src = src.Substring(0, IndexOfRequired(src, "\""));

                // Only site-relative sources need the domain prefix.
                bool isAbsolute = src.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                               || src.StartsWith("https://", StringComparison.OrdinalIgnoreCase);
                string ProductImage = isAbsolute ? src : domain + src;
                string localFile = ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1);

                try
                {
                    inc.DownFile(ProductImage, localFile);
                }
                catch (Exception)
                {
                    // A missing image must not discard the product row itself.
                    ErrorStr.Append("下载图片失败，图片地址：" + localFile + "\r\n");
                }

                dt2.Rows.Add(NewRow);

                worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
                toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();
            }
            catch (Exception err)
            {
                ErrorStr.Append("采集错误：" + err.Message + ";网址：" + url + "\r\n");
            }
        }

        da.Update(dt2);
    }

    DataBind(dt2);
    ShowError(ErrorStr.ToString());
}

/// <summary>
/// Returns the position of <paramref name="marker"/> in <paramref name="text"/>
/// and throws when it is absent, so a page with unexpected markup is reported
/// as an error instead of being stored as garbage.
/// </summary>
private static int IndexOfRequired(string text, string marker)
{
    int index = text.IndexOf(marker, StringComparison.Ordinal);
    if (index < 0)
        throw new InvalidOperationException("页面中未找到标记：" + marker);
    return index;
}

/// <summary>
/// Downloads the resource at <paramref name="url"/> and returns its body as text.
/// The text is decoded with the character set announced by the server; when
/// that is missing or not a known encoding name, UTF-8 is used.
/// </summary>
/// <param name="url">Absolute HTTP address of the page to download.</param>
/// <returns>The complete response body.</returns>
public static string GetHtml(string url)
{
    WebRequest request = WebRequest.Create(url);

    // Dispose the response and its stream even when reading fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream body = response.GetResponseStream())
    {
        Encoding encoding = Encoding.UTF8;
        string charset = response.CharacterSet;
        if (charset != null)
        {
            // Some servers quote the charset value.
            charset = charset.Trim().Trim('"', '\'');
            if (charset.Length > 0)
            {
                try
                {
                    encoding = Encoding.GetEncoding(charset);
                }
                catch (ArgumentException)
                {
                    // Unknown charset name: keep the UTF-8 fallback.
                }
            }
        }

        using (StreamReader sr = new StreamReader(body, encoding))
        {
            return sr.ReadToEnd();
        }
    }
}


/// <summary>
/// Runs <paramref name="strRegex"/> (case-insensitive, multiline) over
/// <paramref name="htmlCode"/> and returns the distinct matched strings
/// in sorted order.
/// </summary>
/// <param name="htmlCode">Text to search.</param>
/// <param name="strRegex">Regular expression pattern to apply.</param>
/// <returns>A sorted list holding each distinct match once.</returns>
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
    Regex pattern = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
    ArrayList unique = new ArrayList();

    foreach (Match hit in pattern.Matches(htmlCode))
    {
        string text = hit.Value;

        // Keep only the first occurrence of each matched string.
        if (unique.Contains(text))
        {
            continue;
        }

        unique.Add(text);
    }

    unique.Sort();
    return unique;
}

/// <summary>
/// Downloads the resource at <paramref name="Url"/> and stores it in the file
/// <paramref name="Path"/>, replacing any existing file.
/// </summary>
/// <param name="Url">Absolute HTTP address of the file to download.</param>
/// <param name="Path">Local file path the content is written to.</param>
public static void DownFile(string Url, string Path)
{
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);

    // Release the connection and both streams even when the copy fails.
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream stream = response.GetResponseStream())
    // FileMode.Create truncates an existing file; OpenOrCreate would leave
    // the tail of a previously larger file behind the new content.
    using (FileStream fs = new FileStream(Path, FileMode.Create, FileAccess.Write))
    {
        byte[] buffer = new byte[4096];
        int read;
        while ((read = stream.Read(buffer, 0, buffer.Length)) > 0)
        {
            fs.Write(buffer, 0, read);
        }
    }
}

您可能感兴趣的文章:

DropDownList获取的SelectIndex一直为0的问题
由于初始化判断出错导致每次传到服务器的时候会初始化一次，这就导致每次获取DropDownList的SelectIndex的时候只能是0
2014-06-06
asp.net通过js实现Cookie创建以及清除Cookie数组的代码
asp.net Cookie创建以及清除Cookie数组
2010-03-03
.NET 6开发TodoList应用之实现Repository模式
这篇文章主要介绍了如何实现一个可重用的Repository模块。文中的示例代码讲解详细，对我们学习或工作有一定的帮助，感兴趣的小伙伴可以跟随小编一起学习一下
2021-12-12
asp.net中使用DatagridView的增删改方法具体实现
asp.net中使用DatagridView的增删改方法具体实现，需要的朋友可以参考一下
2013-06-06
充分利用ASP.NET的三种缓存提高站点性能的注意方法
充分利用ASP.NET的三种缓存提高站点性能的注意方法...
2007-09-09
ASP.NET MVC中使用jQuery时的浏览器缓存问题详解
这篇文章主要介绍了ASP.NET MVC中使用jQuery时的浏览器缓存问题详解,需要的朋友可以参考下。
2016-06-06
防止在服务器处理完成之前用户多次点击提交按钮处理代码
在提交表单时，如果网页速度过慢或者其他原因，用户多次提交能导致数据的修改，怎么解决这个问题呢，接下来将为您解决这个问题，需要的朋友可以了解下
2012-12-12
ASP.NET MVC3 SEO优化：利用Routing特性提高站点权重
这篇文章主要介绍了ASP.NET MVC3 SEO优化：利用Routing特性消除多个路径指向同一个Action,从而提高站点权重,需要的朋友可以参考下。
2016-06-06
浅谈ASP.NET的include的使用方法
include：这是今天的主题，我想没有什么比一个UI更说明问题了，那么这是一个什么页面呢？详见下面。
2013-03-03
ASP.NET中 Execl导出的六种方法实例
这篇文章主要介绍了ASP.NET中 Execl导出的六种方法实例，有需要的朋友可以参考一下
2013-12-12