C# CsQuery Dom操作帮助类及用法 - 实用 - lbx

1、准备工作

在“管理 NuGet 程序包”中搜索 CsQuery,并将其安装到需要使用它的项目中。


2、基类

类代码如下:

/// <summary>
/// Small synchronous HTTP helper built on <see cref="HttpWebRequest"/>.
/// It accepts request headers as a raw "Name:Value" block (for example copied from
/// browser developer tools), can optionally track cookies between requests, routes
/// requests through an authenticated proxy and decodes gzip/deflate response bodies.
/// </summary>
public class HttpRequestClient
    {
        /// <summary>
        /// Headers that must be applied through a dedicated HttpWebRequest property
        /// (or are ignored) instead of Headers.Add. Compared case-insensitively because
        /// HTTP header names are case-insensitive and browsers may emit them in lower case.
        /// </summary>
        static HashSet<String> UNCHANGEHEADS = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "Host",
            "Connection",
            "User-Agent",
            "Referer",
            "Range",
            "Content-Type",
            "Content-Length",
            "Expect",
            "Proxy-Connection",
            "If-Modified-Since",
            "Keep-alive",
            "Accept"
        };

        /// <summary>
        /// Set-Cookie attribute names; a "name=value" pair using one of these names
        /// describes the preceding cookie and is not a cookie itself.
        /// </summary>
        static readonly HashSet<string> cookieAttributeNames = new HashSet<string>(StringComparer.OrdinalIgnoreCase)
        {
            "Path",
            "Domain",
            "Expires",
            "Max-Age",
            "Secure",
            "HttpOnly",
            "SameSite",
            "Version",
            "Comment",
            "Priority"
        };

        static HttpRequestClient()
        {
            // Maximum number of concurrent connections.
            ServicePointManager.DefaultConnectionLimit = 10000;
        }

        /// <summary>
        /// Default request headers, one "Name:Value" pair per line.
        /// </summary>
        public static string defaultHeaders = @"Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:no-cache
Connection:keep-alive
Pragma:no-cache
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";

        /// <summary>
        /// Whether cookies are tracked across requests made through this instance.
        /// </summary>
        bool isTrackCookies = false;

        /// <summary>
        /// Tracked cookies, keyed by cookie name.
        /// </summary>
        Dictionary<String, Cookie> cookieDic = new Dictionary<string, Cookie>();

        /// <summary>
        /// Smoothed response time in milliseconds; -1 until the first sample is recorded.
        /// </summary>
        long avgResponseMilliseconds = -1;

        /// <summary>
        /// Smoothed response time in milliseconds. Assigning a value records a new sample:
        /// the first sample is stored as-is, every later sample is averaged with the
        /// currently stored value.
        /// </summary>
        public long AvgResponseMilliseconds
        {
            get
            {
                return avgResponseMilliseconds;
            }

            set
            {
                if (avgResponseMilliseconds != -1)
                {
                    // Parenthesised on purpose: without the parentheses only the old
                    // value was halved, so the "average" grew with every sample.
                    avgResponseMilliseconds = (value + avgResponseMilliseconds) / 2;
                }
                else
                {
                    avgResponseMilliseconds = value;
                }
            }
        }

        /// <summary>
        /// User name sent to the proxy. Defaults to the value that used to be hard-coded.
        /// NOTE(review): the default is a placeholder account; supply real credentials
        /// from configuration rather than keeping them in source.
        /// </summary>
        public string ProxyUserName { get; set; }

        /// <summary>
        /// Password sent to the proxy. Defaults to the value that used to be hard-coded.
        /// </summary>
        public string ProxyPassword { get; set; }

        /// <summary>
        /// Creates a client.
        /// </summary>
        /// <param name="isTrackCookies">true to remember cookies and resend them on later requests.</param>
        public HttpRequestClient(bool isTrackCookies = false)
        {
            this.isTrackCookies = isTrackCookies;
            ProxyUserName = "1484388229";
            ProxyPassword = "123456";
        }

        /// <summary>
        /// Sends an HTTP request and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="method">HTTP method, e.g. POST or GET.</param>
        /// <param name="headers">Raw header block, one "Name:Value" per line (can be copied from the browser).</param>
        /// <param name="content">Request body; every key and value must already be URL-encoded.</param>
        /// <param name="contentEncode">Encoding used to serialise <paramref name="content"/>; UTF-8 when null.</param>
        /// <param name="proxyUrl">Proxy address, or null/blank for no explicit proxy.</param>
        /// <param name="cookiesHeader">Explicit Cookie header value; when supplied it replaces any other cookies.</param>
        /// <returns>The response body, or an empty string when obtaining the response fails.</returns>
        public string http(string url, string method, string headers, string content, Encoding contentEncode, string proxyUrl,string cookiesHeader)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = method;
            if (method.Equals("GET", StringComparison.InvariantCultureIgnoreCase))
            {
                // NOTE(review): the redirect limit has no effect while AllowAutoRedirect
                // is false; kept as in the original because the intent is unclear.
                request.MaximumAutomaticRedirections = 100;
                request.AllowAutoRedirect = false;
            }

            fillHeaders(request, headers, true);
            fillProxy(request, proxyUrl);

            // An explicit cookie header always wins. Without one, tracked cookies written
            // by fillHeaders must survive; previously they were overwritten unconditionally,
            // so cookie tracking never sent anything back to the server.
            if (!string.IsNullOrWhiteSpace(cookiesHeader) || !isTrackCookies)
            {
                request.Headers[HttpRequestHeader.Cookie] = cookiesHeader;
            }

            // Write the request body, if any.
            if (contentEncode == null)
            {
                contentEncode = Encoding.UTF8;
            }
            if (!string.IsNullOrWhiteSpace(content))
            {
                byte[] data = contentEncode.GetBytes(content);
                request.ContentLength = data.Length;
                using (Stream reqStream = request.GetRequestStream())
                {
                    reqStream.Write(data, 0, data.Length);
                }
            }

            HttpWebResponse response = null;
            System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
            try
            {
                sw.Start();
                response = (HttpWebResponse)request.GetResponse();
                sw.Stop();
                AvgResponseMilliseconds = sw.ElapsedMilliseconds;

                string cookieString = response.Headers[HttpResponseHeader.SetCookie];
                trackCookies(cookieString2Collection(cookieString));
            }
            catch (Exception ex)
            {
                sw.Stop();
                // Keep the "return empty string on failure" contract, but report the cause.
                System.Console.WriteLine("请求异常!" + ex.Message);
                AvgResponseMilliseconds = sw.ElapsedMilliseconds;

                // Release the connection held by a partially handled or error response.
                if (response != null)
                {
                    response.Close();
                }
                WebException webEx = ex as WebException;
                if (webEx != null && webEx.Response != null)
                {
                    webEx.Response.Close();
                }
                return "";
            }

            using (response)
            {
                return getResponseBody(response);
            }
        }

        /// <summary>
        /// Parses a Cookie or (possibly comma-combined) Set-Cookie header value into cookies.
        /// Attribute pairs such as Path, Domain or Expires are skipped, values may contain
        /// '=', and names rejected by <see cref="Cookie"/> are ignored.
        /// </summary>
        /// <param name="cookieString">Raw header value; may be null or blank.</param>
        /// <returns>The parsed cookies; empty when nothing could be parsed.</returns>
        private CookieCollection cookieString2Collection(string cookieString)
        {
            CookieCollection cc = new CookieCollection();
            if (string.IsNullOrWhiteSpace(cookieString))
            {
                return cc;
            }

            foreach (string segment in cookieString.Split(';'))
            {
                // Several Set-Cookie headers are joined with ',' by the header collection,
                // so one ';'-delimited segment can hold the end of one cookie and the start
                // of the next ("Path=/, b=2", "HttpOnly,name=value"). A comma can also be
                // part of a value (an Expires date), so a comma-separated part only starts
                // a new pair when it looks like "token=...".
                List<string> pairs = new List<string>();
                foreach (string part in segment.Split(','))
                {
                    if (pairs.Count == 0 || startsCookiePair(part))
                    {
                        pairs.Add(part);
                    }
                    else
                    {
                        pairs[pairs.Count - 1] = pairs[pairs.Count - 1] + "," + part;
                    }
                }

                foreach (string pair in pairs)
                {
                    addCookie(cc, pair);
                }
            }
            return cc;
        }

        /// <summary>
        /// Returns true when the text looks like the start of a "name=value" pair:
        /// a non-empty name without whitespace followed by '='.
        /// </summary>
        private static bool startsCookiePair(string text)
        {
            string trimmed = text.Trim();
            int eq = trimmed.IndexOf('=');
            if (eq <= 0)
            {
                return false;
            }
            string name = trimmed.Substring(0, eq).Trim();
            return name.Length > 0 && name.IndexOfAny(new[] { ' ', '\t' }) < 0;
        }

        /// <summary>
        /// Adds the cookie described by a "name=value" pair to the collection unless the
        /// pair has no name, is a cookie attribute, or the name is not a valid cookie name.
        /// </summary>
        private static void addCookie(CookieCollection target, string pair)
        {
            string trimmed = pair.Trim();
            int eq = trimmed.IndexOf('=');
            if (eq <= 0)
            {
                // Flag attributes (Secure, HttpOnly) and empty names.
                return;
            }

            string name = trimmed.Substring(0, eq).Trim();
            // Only the first '=' separates name and value; the value may contain more.
            string value = trimmed.Substring(eq + 1).Trim();
            if (name.Length == 0 || cookieAttributeNames.Contains(name))
            {
                return;
            }

            try
            {
                target.Add(new Cookie(name, value));
            }
            catch (CookieException)
            {
                // A malformed cookie name must not abort the whole request; skip it.
            }
        }

        /// <summary>
        /// Sends a POST request.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="headers">Raw header block, one "Name:Value" per line.</param>
        /// <param name="content">URL-encoded request body.</param>
        /// <param name="contentEncode">Encoding used for the body; UTF-8 when null.</param>
        /// <param name="proxyUrl">Proxy address, or null for none.</param>
        /// <param name="cookiesHeader">Explicit Cookie header value, or null.</param>
        /// <returns>The response body, or an empty string when obtaining the response fails.</returns>
        public string httpPost(string url, string headers, string content, Encoding contentEncode, string proxyUrl = null, string cookiesHeader = null)
        {
            return http(url, "POST", headers, content, contentEncode, proxyUrl, cookiesHeader);
        }

        /// <summary>
        /// Sends a GET request.
        /// </summary>
        /// <param name="url">Absolute request URL.</param>
        /// <param name="headers">Raw header block, one "Name:Value" per line.</param>
        /// <param name="content">Not used: no body is sent with a GET request. Kept for signature compatibility.</param>
        /// <param name="proxyUrl">Proxy address, or null for none.</param>
        /// <param name="cookiesHeader">Explicit Cookie header value, or null.</param>
        /// <returns>The response body, or an empty string when obtaining the response fails.</returns>
        public string httpGet(string url, string headers, string content = null, string proxyUrl = null, string cookiesHeader = null)
        {
            return http(url, "GET", headers, null, null, proxyUrl, cookiesHeader);
        }

        /// <summary>
        /// Routes the request through the given proxy, authenticating with
        /// <see cref="ProxyUserName"/> / <see cref="ProxyPassword"/> when a user name is set.
        /// Does nothing when <paramref name="proxyUri"/> is null or blank.
        /// </summary>
        /// <param name="request">Request to configure.</param>
        /// <param name="proxyUri">Proxy address.</param>
        private void fillProxy(HttpWebRequest request, string proxyUri)
        {
            if (string.IsNullOrWhiteSpace(proxyUri))
            {
                return;
            }

            WebProxy proxy = new WebProxy();
            proxy.Address = new Uri(proxyUri);
            if (!string.IsNullOrEmpty(ProxyUserName))
            {
                proxy.Credentials = new NetworkCredential(ProxyUserName, ProxyPassword);
            }
            request.Proxy = proxy;
        }

        /// <summary>
        /// Remembers the given cookies (latest value per name wins) when tracking is enabled.
        /// </summary>
        /// <param name="cookies">Cookies to remember; may be null.</param>
        private void trackCookies(CookieCollection cookies)
        {
            if (!isTrackCookies || cookies == null)
            {
                return;
            }

            foreach (Cookie c in cookies)
            {
                cookieDic[c.Name] = c;
            }
        }

        /// <summary>
        /// Formats all non-expired tracked cookies as a Cookie header value ("a=1; b=2").
        /// </summary>
        /// <returns>The header value; empty when there are no cookies.</returns>
        private string getCookieStr()
        {
            List<string> pairs = new List<string>();
            foreach (KeyValuePair<string, Cookie> item in cookieDic)
            {
                if (!item.Value.Expired)
                {
                    // No spaces around '=': "name = value" is not a valid cookie pair.
                    pairs.Add(item.Key + "=" + item.Value.Value);
                }
            }
            return string.Join("; ", pairs);
        }

        /// <summary>
        /// Applies a raw header block to the request and then sets the Cookie header
        /// from the tracked cookies (or clears it when tracking is disabled).
        /// </summary>
        /// <param name="request">Request to configure; ignored when null.</param>
        /// <param name="headers">Header block, one "Name:Value" per line; CRLF, LF and CR line endings are accepted.</param>
        /// <param name="isPrint">true to write the resulting request headers to the debug output.</param>
        private void fillHeaders(HttpWebRequest request, string headers, bool isPrint = false)
        {
            if (request == null)
            {
                return;
            }

            if (!string.IsNullOrWhiteSpace(headers))
            {
                // The default header block is a verbatim string, so its line endings depend
                // on how the source file was saved; accept every common line ending.
                string[] lines = headers.Split(new[] { "\r\n", "\n", "\r" }, StringSplitOptions.RemoveEmptyEntries);
                foreach (string line in lines)
                {
                    // Only the first ':' separates name and value (values such as URLs contain ':').
                    int colon = line.IndexOf(':');
                    string key = (colon < 0 ? line : line.Substring(0, colon)).Trim();
                    string value = colon < 0 ? string.Empty : line.Substring(colon + 1).Trim();

                    if (key.Length == 0)
                    {
                        // Blank lines and HTTP/2 pseudo-headers (":authority: ...") have no usable name.
                        continue;
                    }

                    if (UNCHANGEHEADS.Contains(key))
                    {
                        applyRestrictedHeader(request, key, value);
                    }
                    else if (!WebHeaderCollection.IsRestricted(key))
                    {
                        request.Headers.Add(key, value);
                    }
                    // Any other restricted header (e.g. Date, Transfer-Encoding) cannot be set
                    // through Headers.Add and is ignored, like Range/Proxy-Connection/Keep-Alive.
                }
            }

            if (isTrackCookies)
            {
                // Merge cookies supplied in the header block into the tracked set,
                // then send everything that is tracked.
                string cookieString = request.Headers[HttpRequestHeader.Cookie];
                trackCookies(cookieString2Collection(cookieString));
                request.Headers[HttpRequestHeader.Cookie] = getCookieStr();
            }
            else
            {
                request.Headers[HttpRequestHeader.Cookie] = "";
            }

            if (isPrint)
            {
                StringBuilder sb = new StringBuilder();
                foreach (string name in request.Headers.AllKeys)
                {
                    sb.AppendLine(name + ":" + request.Headers[name]);
                }
                // Previously the text was built and then discarded; emit it to the debug
                // output so the flag has an effect without changing console output.
                System.Diagnostics.Debug.WriteLine(sb.ToString());
            }
        }

        /// <summary>
        /// Applies a header that HttpWebRequest only accepts through a dedicated property.
        /// Headers in <see cref="UNCHANGEHEADS"/> without a matching property are ignored.
        /// </summary>
        private static void applyRestrictedHeader(HttpWebRequest request, string key, string value)
        {
            // Lower-cased only to drive the switch; header names are case-insensitive.
            switch (key.ToLowerInvariant())
            {
                case "accept":
                    request.Accept = value;
                    break;
                case "host":
                    request.Host = value;
                    break;
                case "connection":
                    request.KeepAlive = value.Equals("keep-alive", StringComparison.OrdinalIgnoreCase);
                    break;
                case "content-type":
                    request.ContentType = value;
                    break;
                case "user-agent":
                    request.UserAgent = value;
                    break;
                case "referer":
                    request.Referer = value;
                    break;
                case "content-length":
                    request.ContentLength = Convert.ToInt64(value, System.Globalization.CultureInfo.InvariantCulture);
                    break;
                case "expect":
                    request.Expect = value;
                    break;
                case "if-modified-since":
                    // HTTP dates use English day/month names; parse them culture-independently.
                    request.IfModifiedSince = Convert.ToDateTime(value, System.Globalization.CultureInfo.InvariantCulture);
                    break;
                default:
                    break;
            }
        }

        /// <summary>
        /// Writes all response headers to the console.
        /// </summary>
        /// <param name="response">Response to inspect; ignored when null.</param>
        private void printResponseHeaders(HttpWebResponse response)
        {
            if (response == null)
            {
                return;
            }

            foreach (string key in response.Headers.AllKeys)
            {
                System.Console.WriteLine(key + ":" + response.Headers[key]);
            }
        }

        /// <summary>
        /// Reads the response body as text, decompressing gzip/deflate content and using
        /// the character set derived from the Content-Type header (UTF-8 by default).
        /// </summary>
        /// <param name="response">Response whose body is read.</param>
        /// <returns>The decoded body text.</returns>
        private string getResponseBody(HttpWebResponse response)
        {
            Encoding bodyEncoding = resolveBodyEncoding(response.ContentType);
            string contentEncoding = (response.ContentEncoding ?? string.Empty).ToLowerInvariant();

            Stream stream = response.GetResponseStream();
            if (stream == null)
            {
                return string.Empty;
            }

            if (contentEncoding.Contains("gzip"))
            {
                stream = new GZipStream(stream, CompressionMode.Decompress);
            }
            else if (contentEncoding.Contains("deflate"))
            {
                stream = new DeflateStream(stream, CompressionMode.Decompress);
            }

            // Disposing the outermost stream also disposes the wrapped response stream.
            using (stream)
            {
                using (StreamReader reader = new StreamReader(stream, bodyEncoding))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Chooses the text encoding for a response from its Content-Type value:
        /// the declared charset when it is supported, otherwise the legacy
        /// gb2312/gbk/zh-cn substring hints, otherwise UTF-8.
        /// </summary>
        private static Encoding resolveBodyEncoding(string contentType)
        {
            if (string.IsNullOrWhiteSpace(contentType))
            {
                return Encoding.UTF8;
            }

            string lowered = contentType.ToLowerInvariant();

            const string charsetKey = "charset=";
            int at = lowered.IndexOf(charsetKey, StringComparison.Ordinal);
            if (at >= 0)
            {
                string charset = lowered.Substring(at + charsetKey.Length);
                int end = charset.IndexOf(';');
                if (end >= 0)
                {
                    charset = charset.Substring(0, end);
                }
                Encoding declared = tryGetEncoding(charset.Trim().Trim('"', '\''));
                if (declared != null)
                {
                    return declared;
                }
            }

            string legacyName = null;
            if (lowered.Contains("gb2312"))
            {
                legacyName = "gb2312";
            }
            else if (lowered.Contains("gbk"))
            {
                legacyName = "gbk";
            }
            else if (lowered.Contains("zh-cn"))
            {
                // "zh-cn" is a language tag, not an encoding name, so asking for that
                // encoding always threw. NOTE(review): GB2312 is assumed to be what
                // was intended for Simplified Chinese content - confirm.
                legacyName = "gb2312";
            }

            return tryGetEncoding(legacyName) ?? Encoding.UTF8;
        }

        /// <summary>
        /// Returns the named encoding, or null when the name is blank or not supported
        /// on the current runtime.
        /// </summary>
        private static Encoding tryGetEncoding(string name)
        {
            if (string.IsNullOrWhiteSpace(name))
            {
                return null;
            }

            try
            {
                return Encoding.GetEncoding(name);
            }
            catch (ArgumentException)
            {
                return null;
            }
        }

        /// <summary>
        /// URL-encodes the trimmed text using the given encoding.
        /// </summary>
        /// <param name="item">Text to encode; null yields null.</param>
        /// <param name="code">Encoding for non-ASCII characters; GB2312 when null (the value that used to be applied unconditionally).</param>
        /// <returns>The URL-encoded text.</returns>
        public static string UrlEncode(string item, Encoding code)
        {
            if (item == null)
            {
                return null;
            }

            // The encoding argument used to be ignored in favour of GB2312,
            // which made UrlEncodeByUTF8 produce GB2312 escapes.
            Encoding encoding = code ?? Encoding.GetEncoding("gb2312");
            return System.Web.HttpUtility.UrlEncode(item.Trim(), encoding);
        }

        /// <summary>
        /// URL-encodes the trimmed text as GB2312.
        /// </summary>
        public static string UrlEncodeByGB2312(string item)
        {
            return UrlEncode(item, Encoding.GetEncoding("gb2312"));
        }

        /// <summary>
        /// URL-encodes the trimmed text as UTF-8.
        /// </summary>
        public static string UrlEncodeByUTF8(string item)
        {
            return UrlEncode(item, Encoding.GetEncoding("utf-8"));
        }

        /// <summary>
        /// HTML-decodes the trimmed text.
        /// </summary>
        /// <param name="item">Text to decode; null yields null.</param>
        public static string HtmlDecode(string item)
        {
            if (item == null)
            {
                return null;
            }

            return WebUtility.HtmlDecode(item.Trim());
        }

    }

3、基本使用

注:假设我们要采集淘宝的商品信息(商品名称,价格,描述,主图图片,轮播图图片(多张),商品详情等),保存到我们自己的服务器,也就是常说的爬虫技术。
使用时候代码如下:

// URL of the product page to scrape.
var url = "https://detail.tmall.com/item.htm?id=566749001850&ali_refid=a3_430342_1006:1109315178:N:%E6%89%8B%E6%9C%BA:4039c645448a408ba50ca78a42ee5cb6&ali_trackid=1_4039c645448a408ba50ca78a42ee5cb6&spm=a230r.1.0.0&sku_properties=5919063:6536025;12304035:1687525009;122216431:27772";
var httpRequest = new HttpRequestClient(true); // helper instance with cookie tracking enabled
String response = httpRequest.httpGet(url, HttpRequestClient.defaultHeaders); // HTML of the product page ("" when the request failed)
// CQ.CreateDocument() builds a DOM that can be queried with jQuery-style selectors.
var dom = CQ.CreateDocument(response);
var title = dom[".tb-main-title"].Text(); // product title
var subTitle = dom[".tb-subtitle"].Text(); // product subtitle
title = title.Replace("\t", "").Replace("\n", "").Replace(" ", "");
if (string.IsNullOrWhiteSpace(title))
{
    return 0; // the title element was not found, so the page was not fetched/parsed correctly
}

// Extract the product-detail URL embedded in the page script:
//   location.protocol==='http:' ? '<detail url>' : ...
const string detailUrlMarker = "location.protocol==='http:' ? '";
var BeginIndexOf = response.IndexOf(detailUrlMarker, StringComparison.Ordinal);
if (BeginIndexOf < 0)
{
    return 0; // marker missing: the page layout changed or the page is not a product page
}
var len = BeginIndexOf + detailUrlMarker.Length;
var endIndexOf = response.IndexOf("' :", len, StringComparison.Ordinal);
if (endIndexOf < 0)
{
    return 0; // closing quote of the detail URL not found
}
var JsonStr = response.Substring(len, endIndexOf - len);
var detailUrl = "http:" + JsonStr; // product-detail URL
String responseDetail = httpRequest.httpGet(detailUrl, HttpRequestClient.defaultHeaders); // fetch the product detail
var DetailContent = responseDetail;
// The detail response is expected to look like: var desc='<html>'; plus trailing characters.
// Strip the leading "var desc='" and the last three characters.
var subLen = "var desc='".Length;
if (DetailContent.Length < subLen + 3)
{
    return 0; // empty or truncated detail response (e.g. the request failed)
}
DetailContent = DetailContent.Substring(subLen, DetailContent.Length - subLen - 3); // product detail HTML

// Carousel thumbnails; the first one is used as the product's main picture.
var picDom = dom["#J_UlThumb li img"];
if (picDom.Length == 0)
{
    return 0; // no carousel pictures on the page
}
var mainPicSrc = picDom[0].GetAttribute("data-src");
if (string.IsNullOrWhiteSpace(mainPicSrc))
{
    return 0; // thumbnail without a data-src attribute
}
// Upload the main picture and keep the returned picture id.
var pic_id = GetUploadPicIdByUrl(mainPicSrc.Replace("50x50", "400x400"));
var detail_content_fin = DetailContent.Replace("\\", "");
// Your own business logic goes here, e.g. writing the scraped data to the database.

var picListDom = picDom; // carousel picture list (same selection as above)
foreach (var item in picListDom)
{
    var picSrc = item.GetAttribute("data-src");
    if (string.IsNullOrWhiteSpace(picSrc))
    {
        continue; // skip thumbnails without a source URL
    }
    // Upload each carousel picture.
    var pic_lunbo_id = GetUploadPicIdByUrl(picSrc.Replace("50x50", "400x400"));
    // Further business logic of your project.
}

4、使用代理

注:如果你要采集(爬虫)的网页做了限制,你是没法采集到信息的,比如要采集1688平台的商品信息是需要使用代理的,否则1688平台会将你的服务器的对外IP加入黑名单,没法正常返回页面信息,这时候就需要采用代理(采用代理时加入黑名单的IP是代理IP,跟我们服务器的IP是没关系的)模拟请求,才能正常采集到数据。

注:帮助类里面的fillProxy()需要设置你购买的代理商的账号和密码,可以购买 “快代理” 服务,推荐使用私密代理,返回的代理IP基本都能使用,购买网址为:https://www.kuaidaili.com/pricing/
关键代码如下:

/// <summary>
/// Routes the request through the given proxy using the account of the purchased
/// proxy service. Nothing is changed when <paramref name="proxyUri"/> is null or blank.
/// </summary>
/// <param name="request">Request to configure.</param>
/// <param name="proxyUri">Proxy address.</param>
private void fillProxy(HttpWebRequest request, string proxyUri)
{
   if (string.IsNullOrWhiteSpace(proxyUri))
   {
       return;
   }

   request.Proxy = new WebProxy
   {
       Address = new Uri(proxyUri),
       Credentials = new NetworkCredential("1484388229", "123456") // user name, password
   };
}

//*******************************************************************************************
// Usage:
string proxyUrl = ""; // proxy address (IP) to route the request through
string url = ""; // URL of the product page to scrape
HttpRequestClient httpRequest = new HttpRequestClient(false); // helper instance without cookie tracking
string response = httpRequest.httpGet(url, HttpRequestClient.defaultHeaders, proxyUrl: proxyUrl);
CQ dom = CQ.CreateDocument(response);
string title = dom["#mod-detail-title .d-title"].Text(); // product title

猜你喜欢

转载自blog.csdn.net/lbx_15887055073/article/details/82120959
今日推荐