当前位置 博文首页 > 使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

    使用HtmlAgilityPack XPath 表达式抓取博客园数据的实现代码

    作者:admin 时间:2021-09-08 19:10


    Web 前端代码(Default.aspx):

    复制代码 代码如下:

    <%@ Page Language="C#" AutoEventWireup="true" CodeFile="Default.aspx.cs" Inherits="_Default" %>
    <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
    <html xmlns="http://www.w3.org/1999/xhtml">
    <head runat="server">
    <title></title>
    </head>
    <body>
    <form runat="server">
    <div>
    <%-- Three-column table (title, author, publish time). The rows are produced by
         Repeater1, whose DataSource is assigned in the code-behind (Default.aspx.cs). --%>
    <table cellpadding="1" cellspacing="1" bgcolor="#f1f1f1" style="text-align: center">
    <asp:Repeater ID="Repeater1" runat="server">
    <HeaderTemplate>
    <%-- Static header row: title / author / publish time. --%>
    <tr>
    <td>
    标题
    </td>
    <td>
    发布作者
    </td>
    <td>
    发布时间
    </td>
    </tr>
    </HeaderTemplate>
    <ItemTemplate>
    <%-- One row per bound item. Eval reads the item's url, title, authorUrl, author and
         updatetime properties by name, so those names must match the bound type exactly.
         NOTE(review): the values are written into the page without encoding here;
         confirm the data source supplies markup-safe text. --%>
    <tr bgcolor="#ffffff">
    <td align="left">
    <a href='<%#Eval("url") %>' target="_blank">
    <%#Eval("title") %>
    </a>
    </td>
    <td>
    <a href='<%#Eval("authorUrl") %>' target="_blank">
    <%#Eval("author") %>
    </a>
    </td>
    <td>
    <%#Eval("updatetime") %>
    </td>
    </tr>
    </ItemTemplate>
    </asp:Repeater>
    </table>
    </div>
    </form>
    </body>
    </html>

    C# 后台代码(Default.aspx.cs):
    复制代码 代码如下:

    using System;
    using System.Collections.Generic;
    using System.Linq;
    using System.Web;
    using System.Web.UI;
    using System.Web.UI.WebControls;
    using S1;
    using System.Net;
    using System.IO;
    using System.Text;
    using HtmlAgilityPack;
    /// <summary>
    /// Code-behind for Default.aspx. On the first (non-postback) request it downloads a
    /// cnblogs.com listing page, extracts each post's title, link, author and publish
    /// time with HtmlAgilityPack XPath queries, and binds the result to Repeater1.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>Root address of the site whose post list is scraped.</summary>
        private const string BaseAddress = "http://www.cnblogs.com";

        /// <summary>
        /// Loads and binds the post list on the initial request; postbacks are left untouched.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        /// <exception cref="WebException">
        /// Propagated unchanged when the remote page cannot be downloaded.
        /// </exception>
        protected void Page_Load(object sender, EventArgs e)
        {
            if (IsPostBack)
            {
                return;
            }

            string address = BaseAddress;
            string pageSegment = Request.QueryString["p"];
            if (!string.IsNullOrEmpty(pageSegment))
            {
                // Paging: the segment is appended as a path, e.g. p=p2, p=p3.
                address += "/" + pageSegment;
            }

            string html = DownloadHtml(address);
            this.Repeater1.DataSource = ParsePosts(html);
            this.Repeater1.DataBind();
        }

        /// <summary>Downloads <paramref name="address"/> and returns its body decoded as UTF-8.</summary>
        /// <param name="address">Absolute URL of the page to fetch.</param>
        /// <returns>The full response body as a string.</returns>
        private static string DownloadHtml(string address)
        {
            // The client, the response stream and the reader all hold a network
            // connection; dispose them deterministically instead of waiting for the GC.
            using (WebClient client = new WebClient())
            using (Stream stream = client.OpenRead(address))
            using (StreamReader reader = new StreamReader(stream, Encoding.UTF8))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>Extracts the posts found under the element with id "post_list".</summary>
        /// <param name="html">Raw HTML of a listing page.</param>
        /// <returns>
        /// One entry per post title link, in document order; an empty list when the page
        /// has no "post_list" element or no matching links.
        /// </returns>
        private static IList<Cnblogs> ParsePosts(string html)
        {
            IList<Cnblogs> posts = new List<Cnblogs>();

            HtmlDocument doc = new HtmlDocument();
            doc.LoadHtml(html);

            // GetElementbyId returns null when the id is absent (e.g. the page layout changed).
            HtmlNode postList = doc.GetElementbyId("post_list");
            if (postList == null)
            {
                return posts;
            }

            // The leading "." keeps the search inside post_list; a bare "//" would be
            // evaluated from the document root. XPath positions are 1-based, so div[2]
            // is the SECOND div child of each post item, which holds the <h3><a> title link.
            HtmlNodeCollection titleLinks = postList.SelectNodes(".//div[2]/h3/a");
            if (titleLinks == null)
            {
                // SelectNodes returns null, not an empty collection, when nothing matches.
                return posts;
            }

            foreach (HtmlNode titleLink in titleLinks)
            {
                Cnblogs post = new Cnblogs();
                post.url = titleLink.GetAttributeValue("href", string.Empty);
                post.title = titleLink.InnerText;
                post.author = string.Empty;
                post.authorUrl = string.Empty;
                post.updatetime = string.Empty;

                // Look the author link up inside this post's own container (a -> h3 -> div)
                // rather than pairing two independent result lists by index, so a post
                // without an author link cannot shift or overrun the remaining entries.
                HtmlNode itemBody = titleLink.ParentNode.ParentNode;
                HtmlNode authorLink = itemBody.SelectSingleNode("./div/a");
                if (authorLink != null)
                {
                    post.author = authorLink.InnerText;
                    post.authorUrl = authorLink.GetAttributeValue("href", string.Empty);

                    // The publish time is the text node that follows the author link.
                    HtmlNode timeNode = authorLink.NextSibling;
                    if (timeNode != null)
                    {
                        post.updatetime = timeNode.InnerText.Replace("发布于", "").Trim();
                    }
                }

                posts.Add(post);
            }

            return posts;
        }

        /// <summary>
        /// One scraped post. The property names are referenced by the Eval expressions
        /// in Default.aspx and must not be renamed.
        /// </summary>
        public class Cnblogs
        {
            /// <summary>Post title text.</summary>
            public string title { get; set; }

            /// <summary>Link to the post.</summary>
            public string url { get; set; }

            /// <summary>Author display name.</summary>
            public string author { get; set; }

            /// <summary>Link to the author's page.</summary>
            public string authorUrl { get; set; }

            /// <summary>Publish time text with the "发布于" prefix removed.</summary>
            public string updatetime { get; set; }
        }
    }

    jsjbwy
    下一篇:没有了