1. Preparation
In Manage NuGet Packages, search for CsQuery and install it into your project.
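If you prefer the Package Manager Console over the UI, the equivalent install command is:
Install-Package CsQuery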
2. The helper class
The class code is as follows:
public class HttpRequestClient
{
static HashSet<String> UNCHANGEHEADS = new HashSet<string>();
static HttpRequestClient()
{
UNCHANGEHEADS.Add("Host");
UNCHANGEHEADS.Add("Connection");
UNCHANGEHEADS.Add("User-Agent");
UNCHANGEHEADS.Add("Referer");
UNCHANGEHEADS.Add("Range");
UNCHANGEHEADS.Add("Content-Type");
UNCHANGEHEADS.Add("Content-Length");
UNCHANGEHEADS.Add("Expect");
UNCHANGEHEADS.Add("Proxy-Connection");
UNCHANGEHEADS.Add("If-Modified-Since");
UNCHANGEHEADS.Add("Keep-alive");
UNCHANGEHEADS.Add("Accept");
ServicePointManager.DefaultConnectionLimit = 10000; // maximum number of concurrent connections
}
/// <summary>
/// Default request headers
/// </summary>
public static string defaultHeaders = @"Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:no-cache
Connection:keep-alive
Pragma:no-cache
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36";
/// <summary>
/// Whether to track cookies
/// </summary>
bool isTrackCookies = false;
/// <summary>
/// Cookie dictionary
/// </summary>
Dictionary<String, Cookie> cookieDic = new Dictionary<string, Cookie>();
/// <summary>
/// Average response time in milliseconds
/// </summary>
long avgResponseMilliseconds = -1;
/// <summary>
/// Average response time in milliseconds
/// </summary>
public long AvgResponseMilliseconds
{
get
{
return avgResponseMilliseconds;
}
set
{
if (avgResponseMilliseconds != -1)
{
avgResponseMilliseconds = (value + avgResponseMilliseconds) / 2;
}
else
{
avgResponseMilliseconds = value;
}
}
}
public HttpRequestClient(bool isTrackCookies = false)
{
this.isTrackCookies = isTrackCookies;
}
/// <summary>
/// Performs an HTTP request
/// </summary>
/// <param name="url"></param>
/// <param name="method">POST or GET</param>
/// <param name="headers">HTTP headers; you can copy them straight from a Chrome request</param>
/// <param name="content">request body; every key and value must be UrlEncoded (see the POST sketch after the class)</param>
/// <param name="contentEncode">encoding of the request body</param>
/// <param name="proxyUrl">proxy URL</param>
/// <param name="cookiesHeader">cookie header string</param>
/// <returns></returns>
public string http(string url, string method, string headers, string content, Encoding contentEncode, string proxyUrl, string cookiesHeader)
{
HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
request.Method = method;
if(method.Equals("GET",StringComparison.InvariantCultureIgnoreCase))
{
request.MaximumAutomaticRedirections = 100;
request.AllowAutoRedirect = false;
}
fillHeaders(request, headers,true);
fillProxy(request, proxyUrl);
if (!string.IsNullOrWhiteSpace(cookiesHeader))
{
request.Headers[HttpRequestHeader.Cookie] = cookiesHeader; // an explicit cookie header overrides any tracked cookies
}
#region Add POST parameters
if (contentEncode == null)
{
contentEncode = Encoding.UTF8;
}
if (!string.IsNullOrWhiteSpace(content))
{
byte[] data = contentEncode.GetBytes(content);
request.ContentLength = data.Length;
using (Stream reqStream = request.GetRequestStream())
{
reqStream.Write(data, 0, data.Length);
reqStream.Close();
}
}
#endregion
HttpWebResponse response = null;
System.Diagnostics.Stopwatch sw = new System.Diagnostics.Stopwatch();
try
{
sw.Start();
response = (HttpWebResponse)request.GetResponse();
sw.Stop();
AvgResponseMilliseconds = sw.ElapsedMilliseconds;
string cookieString = response.Headers[HttpResponseHeader.SetCookie];
var cc=cookieString2Collection(cookieString);
trackCookies(cc);
}
catch (Exception ex)
{
sw.Stop();
System.Console.WriteLine("请求异常!");
AvgResponseMilliseconds = sw.ElapsedMilliseconds;
return "";
}
string result = getResponseBody(response);
return result;
}
private CookieCollection cookieString2Collection(string cookieString)
{
CookieCollection cc = new CookieCollection();
if (!string.IsNullOrWhiteSpace(cookieString))
{
var split = cookieString.Split(';');
foreach (string item in split)
{
if (item.Equals("Path=/", StringComparison.InvariantCultureIgnoreCase))
continue;
var kv = item.Split('=');
if (kv.Length == 2)
{
if (kv[0].Trim().StartsWith("HttpOnly,", StringComparison.InvariantCultureIgnoreCase))
{
var cookie = new Cookie(kv[0].Trim().Remove(0, "HttpOnly,".Length), kv[1].Trim());
cc.Add(cookie);
}
else
{
var cookie = new Cookie(kv[0].Trim(), kv[1].Trim());
cc.Add(cookie);
}
}
}
}
return cc;
}
/// <summary>
/// POST request
/// </summary>
/// <param name="url"></param>
/// <param name="headers"></param>
/// <param name="content"></param>
/// <param name="contentEncode"></param>
/// <param name="proxyUrl"></param>
/// <returns></returns>
public string httpPost(string url, string headers, string content, Encoding contentEncode, string proxyUrl = null, string cookiesHeader = null)
{
return http(url, "POST", headers, content, contentEncode, proxyUrl, cookiesHeader);
}
/// <summary>
/// GET request
/// </summary>
/// <param name="url"></param>
/// <param name="headers"></param>
/// <param name="content"></param>
/// <param name="proxyUrl"></param>
/// <returns></returns>
public string httpGet(string url, string headers, string content = null, string proxyUrl = null, string cookiesHeader = null)
{
return http(url, "GET", headers, null, null, proxyUrl, cookiesHeader);
}
/// <summary>
/// Sets the proxy on the request
/// </summary>
/// <param name="proxyUri"></param>
private void fillProxy(HttpWebRequest request, string proxyUri)
{
if (!string.IsNullOrWhiteSpace(proxyUri))
{
WebProxy proxy = new WebProxy();
proxy.Address = new Uri(proxyUri);
proxy.Credentials = new NetworkCredential("1484388229", "123456"); // username and password from your proxy provider; replace with your own
request.Proxy = proxy;
}
}
/// <summary>
/// Tracks cookies across requests
/// </summary>
/// <param name="cookies"></param>
private void trackCookies(CookieCollection cookies)
{
if (!isTrackCookies) return;
if (cookies == null) return;
foreach (Cookie c in cookies)
{
if (cookieDic.ContainsKey(c.Name))
{
cookieDic[c.Name] = c;
}
else
{
cookieDic.Add(c.Name, c);
}
}
}
/// <summary>
/// Formats the tracked cookies into a Cookie header string
/// </summary>
private string getCookieStr()
{
StringBuilder sb = new StringBuilder();
foreach (KeyValuePair<string, Cookie> item in cookieDic)
{
if (!item.Value.Expired)
{
if (sb.Length == 0)
{
sb.Append(item.Key).Append("=").Append(item.Value.Value);
}
else
{
sb.Append("; ").Append(item.Key).Append(" = ").Append(item.Value.Value);
}
}
}
return sb.ToString();
}
/// <summary>
/// Fills the request headers from a raw header string
/// </summary>
/// <param name="request"></param>
/// <param name="headers"></param>
private void fillHeaders(HttpWebRequest request, string headers, bool isPrint = false)
{
if (request == null) return;
if (string.IsNullOrWhiteSpace(headers)) return;
string[] hsplit = headers.Split(new String[] { "\r\n" }, StringSplitOptions.RemoveEmptyEntries);
foreach (string item in hsplit)
{
string[] kv = item.Split(':');
string key = kv[0].Trim();
string value = string.Join(":", kv.Skip(1)).Trim();
if (!UNCHANGEHEADS.Contains(key))
{
request.Headers.Add(key, value);
}
else
{
#region Set restricted HTTP headers
switch (key)
{
case "Accept":
{
request.Accept = value;
break;
}
case "Host":
{
request.Host = value;
break;
}
case "Connection":
{
if (value == "keep-alive")
{
request.KeepAlive = true;
}
else
{
request.KeepAlive = false;//just test
}
break;
}
case "Content-Type":
{
request.ContentType = value;
break;
}
case "User-Agent":
{
request.UserAgent = value;
break;
}
case "Referer":
{
request.Referer = value;
break;
}
case "Content-Length":
{
request.ContentLength = Convert.ToInt64(value);
break;
}
case "Expect":
{
request.Expect = value;
break;
}
case "If-Modified-Since":
{
request.IfModifiedSince = Convert.ToDateTime(value);
break;
}
default:
break;
}
#endregion
}
}
if (isTrackCookies)
{
string cookieString = request.Headers[HttpRequestHeader.Cookie];
var cc = cookieString2Collection(cookieString);
trackCookies(cc);
}
if (!isTrackCookies)
{
request.Headers[HttpRequestHeader.Cookie] = "";
}
else
{
request.Headers[HttpRequestHeader.Cookie] = getCookieStr();
}
#region Print headers
if (isPrint)
{
StringBuilder sb = new StringBuilder();
for (int i = 0; i < request.Headers.AllKeys.Length; i++)
{
string key = request.Headers.AllKeys[i];
sb.AppendLine(key + ":" + request.Headers[key]);
}
string allHeader = sb.ToString();
System.Console.WriteLine(allHeader); // print the assembled request headers
}
#endregion
}
/// <summary>
/// Prints the response headers
/// </summary>
/// <param name="response"></param>
private void printResponseHeaders(HttpWebResponse response)
{
#region Print headers
if (response == null) return;
for (int i = 0; i < response.Headers.AllKeys.Length; i++)
{
string key = response.Headers.AllKeys[i];
System.Console.WriteLine(key + ":" + response.Headers[key]);
}
#endregion
}
/// <summary>
/// Reads and returns the response body
/// </summary>
/// <param name="response"></param>
/// <returns></returns>
private string getResponseBody(HttpWebResponse response)
{
Encoding defaultEncode = Encoding.UTF8;
string contentType = response.ContentType;
if (contentType != null)
{
if (contentType.ToLower().Contains("gb2312"))
{
defaultEncode = Encoding.GetEncoding("gb2312");
}
else if (contentType.ToLower().Contains("gbk"))
{
defaultEncode = Encoding.GetEncoding("gbk");
}
else if (contentType.ToLower().Contains("zh-cn"))
{
defaultEncode = Encoding.GetEncoding("zh-cn");
}
}
string responseBody = string.Empty;
if (response.ContentEncoding.ToLower().Contains("gzip"))
{
using (GZipStream stream = new GZipStream(response.GetResponseStream(), CompressionMode.Decompress))
{
using (StreamReader reader = new StreamReader(stream,defaultEncode))
{
responseBody = reader.ReadToEnd();
}
}
}
else if (response.ContentEncoding.ToLower().Contains("deflate"))
{
using (DeflateStream stream = new DeflateStream(response.GetResponseStream(), CompressionMode.Decompress))
{
using (StreamReader reader = new StreamReader(stream, defaultEncode))
{
responseBody = reader.ReadToEnd();
}
}
}
else
{
using (Stream stream = response.GetResponseStream())
{
using (StreamReader reader = new StreamReader(stream, defaultEncode))
{
responseBody = reader.ReadToEnd();
}
}
}
return responseBody;
}
public static string UrlEncode(string item, Encoding code)
{
return System.Web.HttpUtility.UrlEncode(item.Trim('\t').Trim(), code);
}
public static string UrlEncodeByGB2312(string item)
{
return UrlEncode(item, Encoding.GetEncoding("gb2312"));
}
public static string UrlEncodeByUTF8(string item)
{
return UrlEncode(item, Encoding.GetEncoding("utf-8"));
}
public static string HtmlDecode(string item)
{
return WebUtility.HtmlDecode(item.Trim('\t').Trim());
}
}
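Because the content parameter of http()/httpPost() expects a URL-encoded key=value body, a minimal POST sketch could look like the following. The URL, form fields, and the extra Content-Type header line are placeholders for illustration only, not part of the original class:
// Hypothetical POST example: build a URL-encoded form body and send it.
var client = new HttpRequestClient();
string postUrl = "http://www.example.com/login"; // placeholder URL
string body = "username=" + HttpRequestClient.UrlEncodeByUTF8("tom")
    + "&password=" + HttpRequestClient.UrlEncodeByUTF8("123456");
// append a Content-Type line so fillHeaders() sets request.ContentType
string postHeaders = HttpRequestClient.defaultHeaders + "\r\nContent-Type:application/x-www-form-urlencoded";
string html = client.httpPost(postUrl, postHeaders, body, Encoding.UTF8);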
3. Basic usage
Note: suppose we want to scrape Taobao product information (product name, price, description, main image, carousel images (multiple), product details, and so on) and save it to our own server, i.e. the typical web-crawling (spider) scenario.
The usage code is as follows:
// URL of the product to scrape
var url = "https://detail.tmall.com/item.htm?id=566749001850&ali_refid=a3_430342_1006:1109315178:N:%E6%89%8B%E6%9C%BA:4039c645448a408ba50ca78a42ee5cb6&ali_trackid=1_4039c645448a408ba50ca78a42ee5cb6&spm=a230r.1.0.0&sku_properties=5919063:6536025;12304035:1687525009;122216431:27772";
var httpRequest = new HttpRequestClient(true); // instantiate the helper class (cookie tracking enabled)
String response = httpRequest.httpGet(url, HttpRequestClient.defaultHeaders); // HTML returned by the crawl
var dom = CQ.CreateDocument(response); // CQ.CreateDocument() builds a DOM from the HTML; you can then query it with jQuery-style selectors, e.g. the product title below
var title = dom[".tb-main-title"].Text(); // product title
var subTitle = dom[".tb-subtitle"].Text(); // product subtitle
title = title.Replace("\t", "");
title = title.Replace("\n", "");
title = title.Replace(" ", "");
if (string.IsNullOrWhiteSpace(title))
{
return 0; // the title is probably invalid, i.e. the title DOM element was not found
}
// The following code extracts the product-detail URL
var BeginIndexOf = response.ToString().IndexOf("location.protocol==='http:' ? '");
var len = BeginIndexOf + "location.protocol==='http:' ? '".Length;
var shengxiaStr = response.ToString().Substring(len, response.ToString().Length - len);
var JsonStr = response.ToString().Substring(len, shengxiaStr.IndexOf("' :"));
var detailUrl = "http:" + JsonStr; // product-detail URL
String responseDetail = httpRequest.httpGet(detailUrl, HttpRequestClient.defaultHeaders); // fetch the product-detail page
var DetailContent = responseDetail.ToString();
var subLen = "var desc='".Length;
DetailContent = DetailContent.Substring(subLen, DetailContent.Length - subLen - 3); // product detail HTML after trimming the JS wrapper
// Get the carousel image list; the first image is used as the product's main image
var picDom = dom["#J_UlThumb li img"];
var pic_id = GetUploadPicIdByUrl(picDom[0].GetAttribute("data-src").Replace("50x50", "400x400"));
// Upload the product image and get back its picture ID (GetUploadPicIdByUrl is your own helper; see the sketch after this snippet)
var detail_content_fin = DetailContent.Replace("\\", "");
// Your project's business logic goes here, e.g. writing the scraped data to a database
var picListDom = dom["#J_UlThumb li img"]; // carousel image list
foreach (var item in picListDom)
{
// upload each carousel image in turn
var pic_lunbo_id = GetUploadPicIdByUrl(item.GetAttribute("data-src").Replace("50x50", "400x400"));
// your project's other business logic
}
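GetUploadPicIdByUrl is project-specific business logic that this article does not define. Purely as an illustration, a hypothetical version that just downloads the image and returns the local file name as the "picture ID" might look like this; replace it with your own upload/database logic:
// Hypothetical helper: download the image and save it locally,
// returning the file name as the picture "ID".
private static string GetUploadPicIdByUrl(string picUrl)
{
    if (picUrl.StartsWith("//")) picUrl = "http:" + picUrl; // data-src is often protocol-relative
    string fileName = Guid.NewGuid().ToString("N") + ".jpg";
    using (var wc = new System.Net.WebClient())
    {
        wc.DownloadFile(picUrl, fileName);
    }
    return fileName;
}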
4. Using a proxy
Note: if the page you want to scrape is protected, you will not be able to collect its data directly. For example, scraping product data from the 1688 platform requires a proxy; otherwise 1688 will blacklist your server's outbound IP and stop returning page content. In that case you have to send the requests through a proxy before you can scrape normally (with a proxy, any blacklisted IP is the proxy's IP and has nothing to do with your own server's IP).
Note: fillProxy() in the helper class must be filled in with the username and password from the proxy provider you purchased. You can buy the "KuaiDaiLi" (快代理) service; its private proxies are recommended, since the proxy IPs it returns almost always work. Pricing page: https://www.kuaidaili.com/pricing/
The key code is as follows:
/// <summary>
/// Sets the proxy on the request
/// </summary>
/// <param name="proxyUri"></param>
private void fillProxy(HttpWebRequest request, string proxyUri)
{
if (!string.IsNullOrWhiteSpace(proxyUri))
{
WebProxy proxy = new WebProxy();
proxy.Address = new Uri(proxyUri);
proxy.Credentials = new NetworkCredential("1484388229", "123456");//用户名,密码
request.Proxy = proxy;
}
}
//*******************************************************************************************
// Usage:
var proxyUrl = ""; // proxy URL, e.g. "http://ip:port"
var url = ""; // URL of the product to scrape
var httpRequest = new HttpRequestClient(false); // instantiate the helper class (no cookie tracking)
String response = httpRequest.httpGet(url, HttpRequestClient.defaultHeaders, proxyUrl: proxyUrl);
var dom = CQ.CreateDocument(response);
var title = dom["#mod-detail-title .d-title"].Text(); //商品标题