C#解析HTML的两种方法

04-25 09:43 6608浏览

举报 T字号

大字

中字

小字

在搜索引擎的开发中，我们需要对网页的HTML内容进行检索，难免就需要对HTML进行解析。拆分每一个节点并且获取节点间的内容，是解析HTML文件的主要工作。下面我们来介绍两种C#解析HTML的方法。

第一种方法：

用System.Net.WebClient下载网页保存到本地文件或字符串中，用正则表达式来分析。这个方法可以用在Web Crawler等需要分析很多网页的应用中。

估计这也是大家最直接，最容易想到的一个方法。

转自网上的一个实例：把所有的href都提取出来：

using System;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
namespace HttpGet
{
    /// <summary>
    /// Console sample: downloads a web page and prints every <c>href="..."</c>
    /// attribute that the regular expression below recognises.
    /// </summary>
    class Class1
    {
        /// <summary>
        /// Downloads http://www.google.com, decodes the bytes as UTF-8 and writes each
        /// matching href attribute to the console, one per line.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        [STAThread]
        static void Main(string[] args)
        {
            byte[] page;
            // WebClient is IDisposable; release it as soon as the download is finished.
            using (WebClient client = new WebClient())
            {
                page = client.DownloadData("http://www.google.com");
            }

            // The response is always decoded as UTF-8, whatever charset the server declares.
            string content = System.Text.Encoding.UTF8.GetString(page);

            // Pattern: href= + opening quote (single or double), an optional "http://",
            // "./" or "/" prefix, a dotted word sequence, optional "/segment" parts, an
            // optional trailing "/" or "?key=value&key=value" query, then a closing quote.
            // Only the "http://" scheme is listed, so "https://" links are not matched.
            string regex = "href=[\\\"\\\'](http:\\/\\/|\\.\\/|\\/)?\\w+(\\.\\w+)*(\\/\\w+(\\.\\w+)?)*(\\/|\\?\\w*=\\w*(&\\w*=\\w*)*)?[\\\"\\\']";
            Regex re = new Regex(regex);

            foreach (Match match in re.Matches(content))
            {
                Console.Write(match.Value + "\r\n");
            }
        }
    }
}

一些爬虫的HTML解析中也是用的类似的方法。

第二种方法：

利用Winista.Htmlparser.Net解析Html。这是.NET平台下解析Html的开源代码，网上有源码下载，百度一下就能搜到，这里就不提供了。并且有英文的帮助文档。找不到的留下邮箱。

个人认为这是.net平台下解析html不错的解决方案，基本上能够满足我们对html的解析工作。

自己做了个实例：

using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Linq;
using System.Text;
using System.Windows.Forms;
using Winista.Text.HtmlParser;
using Winista.Text.HtmlParser.Lex;
using Winista.Text.HtmlParser.Util;
using Winista.Text.HtmlParser.Tags;
using Winista.Text.HtmlParser.Filters;
namespace HTMLParser
{
    /// <summary>
    /// Form that downloads the page selected in <c>CBUrl</c>, shows the raw HTML in
    /// <c>txtHtmlWhole</c> and displays the parsed node structure in <c>treeView1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>
        /// Initialises the designer-generated controls and fills the URL list.
        /// </summary>
        public Form1()
        {
            InitializeComponent();
            AddUrl();
        }

        /// <summary>
        /// Click handler: downloads the selected URL into <c>txtHtmlWhole</c>, then parses
        /// the text box content and rebuilds <c>treeView1</c> under a single "root" node.
        /// </summary>
        /// <param name="sender">Event source (not used).</param>
        /// <param name="e">Event data (not used).</param>
        private void btnParser_Click(object sender, EventArgs e)
        {
            #region Download the page HTML
            try
            {
                txtHtmlWhole.Text = "";

                object selected = CBUrl.SelectedItem;
                if (selected == null)
                {
                    // Nothing selected: say so explicitly instead of surfacing a
                    // NullReferenceException message from the catch block below.
                    MessageBox.Show("Please select a URL first.");
                }
                else
                {
                    string url = selected.ToString().Trim();

                    // WebClient is IDisposable; dispose it once the download is done.
                    using (System.Net.WebClient aWebClient = new System.Net.WebClient())
                    {
                        // NOTE(review): Encoding.Default depends on the runtime/OS, so pages
                        // served in a different charset may come out garbled - confirm.
                        aWebClient.Encoding = System.Text.Encoding.Default;
                        string html = aWebClient.DownloadString(url);
                        txtHtmlWhole.Text = html;
                    }
                }
            }
            catch (Exception ex)
            {
                // Top-level UI handler: report the failure and fall through, so the tree
                // below is still reset (the text box is empty at this point).
                MessageBox.Show(ex.Message);
            }
            #endregion

            #region Parse the HTML into tree nodes
            Lexer lexer = new Lexer(this.txtHtmlWhole.Text);
            Parser parser = new Parser(lexer);
            NodeList htmlNodes = parser.Parse(null);

            this.treeView1.Nodes.Clear();
            this.treeView1.Nodes.Add("root");
            TreeNode treeRoot = this.treeView1.Nodes[0];

            for (int i = 0; i < htmlNodes.Count; i++)
            {
                this.RecursionHtmlNode(treeRoot, htmlNodes[i], false);
            }
            #endregion
        }

        /// <summary>
        /// Adds a tree node for <paramref name="htmlNode"/> under
        /// <paramref name="treeNode"/> when it is a non-end tag, then recurses into its
        /// children and, optionally, walks its following siblings.
        /// </summary>
        /// <param name="treeNode">Tree node that receives the new entries; nothing happens if null.</param>
        /// <param name="htmlNode">Parsed HTML node to display; nothing happens if null.</param>
        /// <param name="siblingRequired">
        /// When true, every following sibling of <paramref name="htmlNode"/> is also added
        /// under <paramref name="treeNode"/>.
        /// </param>
        private void RecursionHtmlNode(TreeNode treeNode, INode htmlNode, bool siblingRequired)
        {
            if (htmlNode == null || treeNode == null)
            {
                return;
            }

            // Children attach to the tag's own tree node when one is created below,
            // otherwise (text nodes, end tags) directly to the incoming parent.
            TreeNode current = treeNode;

            // Current node: only non-end tags get a tree entry of their own.
            ITag tag = htmlNode as ITag;
            if (tag != null && !tag.IsEndTag())
            {
                string nodeString = tag.TagName;
                if (tag.Attributes != null && tag.Attributes.Count > 0)
                {
                    // Look each attribute up once and reuse the value.
                    object id = tag.Attributes["ID"];
                    if (id != null)
                    {
                        nodeString = nodeString + "{id=\"" + id.ToString() + "\"}";
                    }

                    object href = tag.Attributes["HREF"];
                    if (href != null)
                    {
                        nodeString = nodeString + "{href=\"" + href.ToString() + "\"}";
                    }
                }

                current = new TreeNode(nodeString);
                treeNode.Nodes.Add(current);
            }

            // Content between the tags: recurse into the children, then add the first
            // child's text as an extra entry.
            if (htmlNode.Children != null && htmlNode.Children.Count > 0)
            {
                this.RecursionHtmlNode(current, htmlNode.FirstChild, true);

                // NOTE(review): the content entry is attached to treeNode (the parent),
                // not to current, so it shows up beside the tag rather than inside it.
                // Kept as in the original - confirm whether that placement is intended.
                TreeNode content = new TreeNode(htmlNode.FirstChild.GetText());
                treeNode.Nodes.Add(content);
            }

            // Sibling nodes: walked iteratively; each call passes false so a sibling
            // does not walk the rest of the chain again.
            if (siblingRequired)
            {
                INode sibling = htmlNode.NextSibling;
                while (sibling != null)
                {
                    this.RecursionHtmlNode(treeNode, sibling, false);
                    sibling = sibling.NextSibling;
                }
            }
        }

        /// <summary>
        /// Fills <c>CBUrl</c> with the sample URLs offered for parsing.
        /// </summary>
        private void AddUrl()
        {
            CBUrl.Items.Add("http://www.hao123.com");
            CBUrl.Items.Add("http://www.sina.com");
            CBUrl.Items.Add("http://www.heuet.edu.cn");
        }
    }
}

以上就是C#解析HTML的两种方法，我们通过分析源码可以了解到解析HTML的详细步骤，只要我们按部就班地按照步骤来进行解析，基本上就能够成功解析HTML了。更多HTML视频教程请在动力节点在线免费观看，源码资料免费下载。

0人推荐

共同学习，写下你的评论

0条评论

代码小兵988

67篇文章贡献228982字

C#解析HTML的两种方法

第一种方法：

第二种方法：

相关课程更多>

作者相关文章更多>

推荐相关文章更多>

发评论

举报

C#解析HTML的两种方法

第一种方法：

第二种方法：

相关课程 更多>

作者相关文章更多>

推荐相关文章更多>

发评论

举报

相关课程更多>