c#抓取网页(带解析js)

抓取中国人民银行网站(pbc.gov.cn)公布的人民币汇率中间价 (phantomjs-1.9.2-windows + Selenium.WebDriver.3.8.0)

直接上代码

using LTITools.util;
using OpenQA.Selenium;
using OpenQA.Selenium.PhantomJS;
using System;
using System.Collections.Generic;
using System.Data;
using System.IO;
using System.Linq;
using System.Text.RegularExpressions;
using System.Threading;
using System.Windows.Forms;

namespace LTITools
{

    /// <summary>
    /// 1.第一次访问,头部会写cookies,同时返回的是js,js是加密后的字符串,需要反序列化然后执行。
    /// 2.js会判断当前的浏览器window的宽高
    /// 3.js执行后会再次写cookies,同时跳转指定的解析出来的地址。
    /// 4.第二个地址回写cookies,同时头部302跳转。
    /// 5.后续需要带着一二次的访问返回的cookies进行访问。
    /// </summary>
    public partial class ChinaBankRate : Form
    {
        /// <summary>
        /// Creates the form: runs <c>InitializeComponent</c> (defined in another part of this
        /// partial class) and then <see cref="InitData"/> to put start-up values into the controls.
        /// </summary>
        public ChinaBankRate()
        {
            InitializeComponent();
            InitData();
        }

        // Upper bound of the list-page loop in DoWorkGetList; overwritten with _totalPage
        // once the first list page has been read.
        int _foreachPageCount = 1;
        // URL template taken from txtUrl; "{0}" is filled with the page number via string.Format.
        string _url = "";
        // Total record count parsed from the first list page.
        int _totalCount = 1;
        // Number of list pages still to fetch, derived from _totalCount minus the rows already stored.
        int _totalPage = 0;
        // Pause in milliseconds taken from txtStop (clamped to at least 100 in btnOk_Click);
        // used both as a fixed retry delay and as the upper bound of the random per-page pause.
        int _stopMSec = 1000;
        // Entries selected by the last export (one per month); written in btnExportData_Click
        // and read by ScreenCapture on a background thread.
        IEnumerable<ChinaBankRateListItem> monthList;


        /// <summary>
        /// Puts the start-up values into the form controls: the list-page URL template,
        /// the "clear old data" check box, the month pickers (format and allowed range)
        /// and the usage notes shown in <c>txtAbout</c>.
        /// </summary>
        private void InitData()
        {
            // Earliest selectable date. Built from components instead of parsing the
            // text "2015-8-1", so the result does not depend on the current culture.
            var earliestDate = new DateTime(2015, 8, 1);

            txtUrl.Text = "http://www.pbc.gov.cn/zhengcehuobisi/125207/125217/125925/17105/index{0}.html";
            chkClearOldData.Checked = false;

            // Default the start picker to one month ago (date part only). Assigning Value
            // avoids formatting the date to text and parsing it back through the culture.
            dtBeginDate.Value = DateTime.Now.AddMonths(-1).Date;
            dtBeginDate.CustomFormat = "yyyy-MM";
            dtEndDate.CustomFormat = "yyyy-MM";
            dtBeginDate.Format = DateTimePickerFormat.Custom;
            dtEndDate.Format = DateTimePickerFormat.Custom;
            dtBeginDate.MinDate = earliestDate;
            dtEndDate.MinDate = earliestDate;
            dtEndDate.MaxDate = DateTime.Now;

            txtAbout.AppendText(" 1.请先通过[第一步,数据抓取]Tab进行抓取,抓取会遇到IP禁用、防抓取网络异常等," +
                "如有异常,可以进行多次抓取(注:抓取过程中会弹出黑框界面,抓取完成后会自动关闭);");
            txtAbout.AppendText("\n 2.抓取成功后,通过[第二步,数据导出]导出指定月份的数据(注:如果抓取过程中,则不能进行导出);");
            txtAbout.AppendText("\n 3.仅能导出指定当月日期最大的汇率数据(注:仅支持导出2015年8月以后的数据);");
        }

        /// <summary>
        /// Deletes the workbook located at the default path returned by <see cref="GetExcelPath"/>.
        /// </summary>
        private void ClearData()
        {
            string cachedWorkbook = GetExcelPath();
            File.Delete(cachedWorkbook);
        }


        /// <summary>
        /// Click handler that starts a crawl. Reads the URL template and pause length from
        /// the form, optionally deletes the stored workbook, then runs
        /// <see cref="DoWorkGetList"/> on a background thread (list pages first, then the
        /// detail page of every list entry).
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnOk_Click(object sender, EventArgs e)
        {
            // Validate the pause length before touching any UI state, so that bad input
            // cannot leave the buttons disabled.
            int pauseMs;
            if (!int.TryParse(txtStop.Text.Trim(), out pauseMs))
            {
                MessageBox.Show("暂停毫秒数必须是整数。");
                return;
            }

            btnOk.Enabled = false;
            btnExportData.Enabled = false;
            _url = txtUrl.Text.Trim();
            // Never pause for less than 100 ms; the crawler uses 100 as the lower bound
            // of its random delay.
            _stopMSec = Math.Max(100, pauseMs);

            try
            {
                if (chkClearOldData.Checked)
                {
                    ClearData();
                }

                Thread t = new Thread(new ThreadStart(DoWorkGetList));
                t.IsBackground = true;
                t.Start();
            }
            catch
            {
                // The worker never started (e.g. the workbook could not be deleted), so
                // nothing will re-enable the buttons later; do it here and rethrow.
                btnOk.Enabled = true;
                btnExportData.Enabled = true;
                throw;
            }
        }


        /// <summary>
        /// Currently a no-op: the whole body is commented out, so neither <paramref name="driver"/>
        /// nor <paramref name="url"/> is used. The disabled code records the browser window
        /// width*height on the body element and, when the page source contains "dynamicurl",
        /// reads the page's HXXTTKKLLPPP5 script, replaces its findDimensions() check with
        /// <c>if(false)</c> and executes it.
        /// </summary>
        /// <param name="driver">PhantomJS driver (unused while the body is disabled).</param>
        /// <param name="url">Target URL (unused while the body is disabled).</param>
        private void GotoURLAndCheckCookies(PhantomJSDriver driver, string url)
        {
            //var js = "var w= window.innerWidth||document.documentElement.clientWidth||document.body.clientWidth;var h= window.innerHeight||document.documentElement.clientHeight||document.body.clientHeight;"
            //    + "document.body.setAttribute(\"wh\", w*h);";
            //driver.ExecutePhantomJS(js);
            //var wh = driver.FindElement(By.TagName("body")).GetAttribute("wh");

            //if (driver.PageSource.Contains("dynamicurl"))
            //{
            //    driver.ExecuteScript("document.body.setAttribute(\"cookieString\", HXXTTKKLLPPP5);");
            //    var cookieString = driver.FindElement(By.TagName("body")).GetAttribute("cookieString");
            //    var newjs = cookieString.Replace("if(findDimensions())", "if(false)")
            //        .Replace("if(findDimensions())", "if(false)");
            //    driver.ExecutePhantomJS(newjs + " HXXTTKKLLPPP5();");
            //} 
        }


        /// <summary>
        /// Background-thread worker started by <see cref="btnOk_Click"/>.
        /// Walks the list pages built from <c>_url</c>, appends every entry that is not yet
        /// stored (and is dated after 2015-08-01) to the stored data, saves the workbook and
        /// then calls <see cref="DoWorkGetDetail"/> to fetch the detail pages.
        /// All UI updates go through <c>Control.Invoke</c>.
        /// </summary>
        /// <remarks>
        /// Whatever happens, the PhantomJS driver is shut down, both buttons are re-enabled
        /// and the final status label is written.
        /// </remarks>
        private void DoWorkGetList()
        {
            Action<String> AsynclblResultAsy = delegate(string text) { lblResult.Text = text; };
            Action<String> AsyncUIDelegateResult = delegate(string text) { txtResult.AppendText(text); };
            Action AsyncUIDelegateDone = delegate() { btnOk.Enabled = true; };
            Action AsyncUIDelegateExportDone = delegate { btnExportData.Enabled = true; };

            // Page size used by the paging arithmetic below (value carried over from the original code).
            const int pageSize = 20;
            const string listSelector = "font[class='newslist_style'] > a";
            var earliestDate = new DateTime(2015, 8, 1);
            // One generator for the whole run; creating one per iteration is wasteful and
            // can repeat values when seeded from the clock.
            var random = new Random();
            bool blret = false;
            PhantomJSDriver driver = null;

            // Start every run with a single page; the real page count is computed after the
            // first page is read. Without this a value left over from a previous run could
            // skip the loop entirely.
            _foreachPageCount = 1;

            txtResult.Invoke(AsyncUIDelegateResult, new object[] { DateTime.Now.ToShortTimeString() + "开始执行,正在抓取列表数据...\n" });

            try
            {
                driver = new PhantomJSDriver(GetPhantomJSDriverService());
                driver.Manage().Window.Size = new System.Drawing.Size() { Height = 800, Width = 600 };

                var ExistDatalists = GetExistData();
                var historyTotalCount = ExistDatalists.Count;

                for (var i = 1; i <= _foreachPageCount; i++)
                {
                    var url = string.Format(_url, i);

                    driver.Navigate().GoToUrl(url);
                    // Random pause between requests.
                    int randKey = random.Next(100, _stopMSec);
                    Thread.Sleep(randKey);

                    // Total record count and paging are only read from the first page.
                    if (i == 1)
                    {
                        Thread.Sleep(_stopMSec);
                        var totalinfos = FindTotalInfoElement(driver);
                        // Retry once when the cell is missing or still empty.
                        if (null == totalinfos || string.IsNullOrEmpty(totalinfos.Text))
                        {
                            Thread.Sleep(_stopMSec);
                            totalinfos = FindTotalInfoElement(driver);
                        }

                        if (null != totalinfos && !string.IsNullOrEmpty(totalinfos.Text))
                        {
                            // The count is the text between the first colon and the first comma.
                            // Full-width punctuation is accepted as well.
                            // NOTE(review): the page's exact punctuation is not visible here - confirm.
                            var firstPart = totalinfos.Text.Split(',', '\uFF0C')[0];
                            _totalCount = Convert.ToInt32(firstPart.Split(':', '\uFF1A')[1]);

                            int newCount = _totalCount - historyTotalCount;
                            if (newCount == 0)
                            {
                                // Nothing new on the site.
                                break;
                            }

                            _totalPage = newCount / pageSize;
                            if (newCount % pageSize != 0)
                            {
                                _totalPage++;
                            }
                            _foreachPageCount = _totalPage;
                        }
                        else
                        {
                            txtResult.Invoke(AsyncUIDelegateResult,
                                new object[] { "totalinfos为空,抓取异常,请稍后试\n" });
                            break;
                        }
                    }

                    // Stop when the recomputed page count is already exceeded.
                    if (i > _foreachPageCount)
                        break;

                    // Links of the list entries on this page.
                    var lists = driver.FindElements(By.CssSelector(listSelector));
                    if (lists == null || lists.Count == 0)
                    {
                        Thread.Sleep(_stopMSec);
                        lists = driver.FindElements(By.CssSelector(listSelector));
                    }

                    if (lists == null || lists.Count == 0)
                    {
                        txtResult.Invoke(AsyncUIDelegateResult, new object[] { "lists为空,抓取异常,请稍后试\n" });
                        break;
                    }

                    var breakFlag = "";
                    foreach (var item in lists)
                    {
                        // The identifier is the part of the link text before the first '中'.
                        var identifierDate = item.Text.Split('中')[0].Trim();
                        if (!ExistDatalists.Any(t => t.Identifier == identifierDate) && Convert.ToDateTime(identifierDate) > earliestDate)
                        {
                            ExistDatalists.Add(new ChinaBankRateListItem()
                            {
                                PIdentifier = i.ToString(),
                                Identifier = identifierDate,
                                Href = item.GetAttribute("href"),
                                Title = item.Text,
                                IsSucess = "true",
                                HtmlContent = "",
                            });
                        }
                        else
                        {
                            // Already stored (or too old): everything after it is treated as known.
                            breakFlag = identifierDate;
                            break;
                        }
                    }

                    if (!string.IsNullOrEmpty(breakFlag))
                    {
                        txtResult.Invoke(AsyncUIDelegateResult, new object[] { "当前已包含 " + breakFlag + "\n" });
                        break;
                    }

                    txtResult.Invoke(AsyncUIDelegateResult, new object[] { "处理完成行第[" + i + "]条列表(暂停" + randKey + "毫秒),url:" + url + "\n" });
                }

                SaveDataToExcel(ExistDatalists.OrderByDescending(t => Convert.ToDateTime(t.Identifier)));
                txtResult.Invoke(AsyncUIDelegateResult, new object[] { "列表数据抓取完成!\n" });

                // Second round: fetch the detail page of every stored entry without content.
                blret = DoWorkGetDetail(driver, AsyncUIDelegateResult, AsynclblResultAsy);
            }
            catch (Exception ex)
            {
                // An unhandled exception on this thread would take the process down and
                // leave phantomjs.exe running; report it in the log box instead.
                blret = false;
                txtResult.Invoke(AsyncUIDelegateResult, new object[] { "列表数据抓取异常" + ex.Message + "\n" + ex.StackTrace + "\n" });
            }
            finally
            {
                try
                {
                    if (driver != null)
                    {
                        driver.Quit();
                    }
                }
                finally
                {
                    btnOk.Invoke(AsyncUIDelegateDone);
                    btnExportData.Invoke(AsyncUIDelegateExportDone);
                    lblResult.Invoke(AsynclblResultAsy, new object[] { string.Format("全部处理完成({0}),{1}", (blret ? "成功" : "有异常,请继续点击抓取开始"), DateTime.Now) });
                }
            }
        }

        /// <summary>
        /// Returns the first <c>td[class='Normal']</c> element of the current page, or null
        /// when there is none. <c>FindElements</c> is used because <c>FindElement</c> throws
        /// when nothing matches.
        /// </summary>
        private static IWebElement FindTotalInfoElement(PhantomJSDriver driver)
        {
            return driver.FindElements(By.CssSelector("td[class='Normal']")).FirstOrDefault();
        }


        /// <summary>
        /// Loads the detail page of every stored entry whose <c>HtmlContent</c> is empty,
        /// stores the text of the first <c>div[id='zoom'] &gt; p</c> element in the entry and
        /// writes the whole data set back to the workbook.
        /// </summary>
        /// <param name="driver">Driver used to load the detail pages.</param>
        /// <param name="AsyncUIDelegate">Delegate that appends a line to the log box.</param>
        /// <param name="AsynclblResultAsy">Delegate that sets the status label text.</param>
        /// <returns>
        /// True when every pending entry received content; false when an exception occurred
        /// or at least one entry stayed empty (such entries are picked up again on the next run).
        /// </returns>
        private bool DoWorkGetDetail(PhantomJSDriver driver, Action<String> AsyncUIDelegate, Action<String> AsynclblResultAsy)
        {
            const string contentSelector = "div[id='zoom'] > p";
            bool blret = true;
            var ExistDatalists = GetExistData();
            try
            {
                var items = ExistDatalists.Where(t => string.IsNullOrEmpty(t.HtmlContent)).ToArray();
                lblResult.Invoke(AsynclblResultAsy, new object[] { string.Format("开始抓取详情页面,总共{0}条数据...", items.Length) });

                var random = new Random();
                for (var i = 0; i < items.Length; i++)
                {
                    driver.Navigate().GoToUrl(items[i].Href);
                    // Random pause between requests.
                    int randKey = random.Next(100, _stopMSec);
                    Thread.Sleep(randKey);

                    var text = GetFirstElementText(driver, contentSelector);
                    if (string.IsNullOrEmpty(text))
                    {
                        // Give the page one more chance to finish rendering.
                        Thread.Sleep(_stopMSec);
                        text = GetFirstElementText(driver, contentSelector);
                    }

                    if (!string.IsNullOrEmpty(text))
                    {
                        items[i].HtmlContent = text;
                        items[i].IsSucess = "true";
                    }
                    else
                    {
                        // Keep going with the remaining entries; this one stays empty and
                        // is reported through the return value.
                        items[i].IsSucess = "false";
                        blret = false;
                    }
                    txtResult.Invoke(AsyncUIDelegate, new object[] { string.Format("处理第[{0}]条(暂停{1}毫秒),日期[{2}] \n", i + 1, randKey, items[i].Identifier) });
                }
            }
            catch (Exception ex)
            {
                blret = false;
                txtResult.Invoke(AsyncUIDelegate, new object[] { "详情数据抓取异常" + ex.Message + "\n" + ex.StackTrace + "\n" });
            }

            // Persist whatever was collected, even after a failure.
            SaveDataToExcel(ExistDatalists);
            txtResult.Invoke(AsyncUIDelegate, new object[] { "详情数据抓取完成!\n" });
            txtResult.Invoke(AsyncUIDelegate, new object[] { "全部数据抓取完成!\n" });
            return blret;
        }

        /// <summary>
        /// Returns the text of the first element matching <paramref name="cssSelector"/>, or
        /// null when nothing matches. <c>FindElements</c> is used because the single-element
        /// lookup throws instead of returning null.
        /// </summary>
        private static string GetFirstElementText(PhantomJSDriver driver, string cssSelector)
        {
            var element = driver.FindElements(By.CssSelector(cssSelector)).FirstOrDefault();
            return element == null ? null : element.Text;
        }


        /// <summary>
        /// Creates the PhantomJS driver service with default settings.
        /// This is also the place where a proxy would be configured; that code is disabled.
        /// </summary>
        /// <returns>The default <c>PhantomJSDriverService</c>.</returns>
        private static PhantomJSDriverService GetPhantomJSDriverService()
        {
            // Disabled proxy configuration, kept for reference:
            //   proxy server address:  service.Proxy = $"{ip}:{port}";
            //   proxy credentials:     service.ProxyAuthentication = GetProxyAuthorization();
            return PhantomJSDriverService.CreateDefaultService();
        }


        /// <summary>
        /// Writes <paramref name="list"/> to the workbook at <see cref="GetExcelPath"/>,
        /// replacing any existing file. Each entry becomes one row with the columns
        /// Identifier, PIdentifier, Title, IsSucess, Href and HtmlContent.
        /// </summary>
        /// <param name="list">Entries to store.</param>
        /// <returns>Always true.</returns>
        private bool SaveDataToExcel(IEnumerable<ChinaBankRateListItem> list)
        {
            var table = new DataTable();
            var columnNames = new[] { "Identifier", "PIdentifier", "Title", "IsSucess", "Href", "HtmlContent" };
            foreach (var columnName in columnNames)
            {
                table.Columns.Add(columnName);
            }

            foreach (var entry in list)
            {
                DataRow row = table.NewRow();
                row["Identifier"] = entry.Identifier;
                row["PIdentifier"] = entry.PIdentifier;
                row["Title"] = entry.Title;
                row["IsSucess"] = entry.IsSucess;
                row["Href"] = entry.Href;
                row["HtmlContent"] = entry.HtmlContent;
                table.Rows.Add(row);
            }

            // Remove the previous workbook first, then write the new one to the same path.
            string workbookPath = GetExcelPath();
            File.Delete(workbookPath);
            ExcelHelper.SaveExcelToFile(workbookPath, table);
            return true;
        }

        /// <summary>
        /// Builds the full path of an .xlsx file located in the application's base directory.
        /// </summary>
        /// <param name="name">File name without extension; defaults to "data".</param>
        /// <returns>Base directory + <paramref name="name"/> + ".xlsx".</returns>
        private string GetExcelPath(string name = "data")
        {
            // Path.Combine inserts exactly one separator; plain concatenation doubled it
            // whenever the base directory already ended with one.
            return Path.Combine(AppDomain.CurrentDomain.BaseDirectory, name + ".xlsx");
        }

        /// <summary>
        /// Loads the previously stored entries from the workbook at <see cref="GetExcelPath"/>.
        /// Every cell is converted to a string and trimmed.
        /// </summary>
        /// <returns>The stored entries; an empty list when the workbook does not exist.</returns>
        private List<ChinaBankRateListItem> GetExistData()
        {
            string workbookPath = GetExcelPath();
            if (!File.Exists(workbookPath))
            {
                return new List<ChinaBankRateListItem>();
            }

            // Reads one column of a row as trimmed text.
            Func<DataRow, string, string> cell = (row, column) => row[column].ToString().Trim();

            var sheet = ExcelHelper.ReadExcelFile(workbookPath, 0);
            return sheet.Rows
                .Cast<DataRow>()
                .Select(row => new ChinaBankRateListItem()
                {
                    Href = cell(row, "Href"),
                    HtmlContent = cell(row, "HtmlContent"),
                    Identifier = cell(row, "Identifier"),
                    Title = cell(row, "Title"),
                    IsSucess = cell(row, "IsSucess"),
                    PIdentifier = cell(row, "PIdentifier"),
                })
                .ToList();
        }

        /// <summary>
        /// Click handler that exports rates. For every month in the selected range it takes
        /// the stored entry with the latest date, starts <see cref="ScreenCapture"/> on a
        /// background thread for those entries, parses the rates out of each entry's content,
        /// writes them to a new workbook under "Export" and opens that workbook.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnExportData_Click(object sender, EventArgs e)
        {
            btnExportData.Enabled = false;
            btnExportData.Text = "数据导出中...";

            try
            {
                var beginDate = new DateTime(dtBeginDate.Value.Year, dtBeginDate.Value.Month, 1);
                // Last day of the selected end month.
                var endDate = new DateTime(dtEndDate.Value.Year, dtEndDate.Value.Month, 1).AddMonths(1).AddDays(-1);

                // Parse every identifier once, newest first, then keep the first (= latest)
                // entry of each month.
                var export = GetExistData()
                    .Select(t => new { Item = t, Date = Convert.ToDateTime(t.Identifier) })
                    .Where(x => beginDate <= x.Date && x.Date <= endDate)
                    .OrderByDescending(x => x.Date);

                // Materialise the result: the field is also read by the capture thread, and a
                // deferred query would be re-evaluated on every enumeration.
                monthList = export
                    .GroupBy(x => x.Date.ToString("yyyy-MM"))
                    .Select(g => g.First().Item)
                    .ToList();

                // Screenshots are taken in the background.
                Thread t1 = new Thread(new ThreadStart(ScreenCapture));
                t1.IsBackground = true;
                t1.Start();

                var allExportData = new List<ChinaBankRateExport>();
                foreach (var item in monthList)
                {
                    var datas = GetRateFromHtmlContent(Convert.ToDateTime(item.Identifier), item.HtmlContent);
                    allExportData.AddRange(datas);
                }

                var dt = new DataTable();
                dt.Columns.Add("起兑币种");
                dt.Columns.Add("兑换币种");
                dt.Columns.Add("汇率");
                dt.Columns.Add("状态");
                dt.Columns.Add("生效日期");
                dt.Columns.Add("备注");
                foreach (var item in allExportData)
                {
                    DataRow dr = dt.NewRow();
                    var index = 0;
                    dr[index++] = item.From;
                    dr[index++] = item.To;
                    dr[index++] = item.Rate;
                    dr[index++] = item.Status;
                    dr[index++] = item.EffectiveDate;
                    dr[index++] = item.Des;
                    dt.Rows.Add(dr);
                }

                string baseDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Export");
                Directory.CreateDirectory(baseDir); // does nothing when the folder already exists
                var fileName = "From" + beginDate.ToString("yyyyMM") + "To"
                    + endDate.ToString("yyyyMM")
                    + "_" + DateTime.Now.ToString("yyyyMMddHHmmss") + ".xlsx";
                var fullPath = Path.Combine(baseDir, fileName);

                ExcelHelper.SaveExcelToFile(fullPath, dt);
                System.Diagnostics.Process.Start(fullPath);
                lblExportResult.Text = "导出完成!" + DateTime.Now.ToShortDateString();
            }
            finally
            {
                // Restore the button even when the export failed, so the user can retry.
                btnExportData.Enabled = true;
                btnExportData.Text = "导出指定日期数据";
            }
        }


        /// <summary>
        /// Extracts one rate per currency listed in <c>RateDic.RateNameDic</c> from the text
        /// of an announcement. The text is split at commas; for each currency the first
        /// segment that contains the currency name and a decimal number supplies the rate.
        /// Segments containing "人民币1元对" are inverted (1 / rate) so that every exported
        /// rate is "foreign currency to CNY"; segments containing "100日元" are divided by 100.
        /// </summary>
        /// <param name="dt">Effective date stored on every returned rate.</param>
        /// <param name="htmlContent">Announcement text; null or empty yields an empty list.</param>
        /// <returns>The rates found, in the enumeration order of <c>RateDic.RateNameDic</c>.</returns>
        private List<ChinaBankRateExport> GetRateFromHtmlContent(DateTime dt, string htmlContent)
        {
            var exportRate = new List<ChinaBankRateExport>();
            if (string.IsNullOrEmpty(htmlContent))
            {
                return exportRate;
            }

            var CNYtoFlags = new string[] { "人民币1元对" };

            // Split on the ASCII comma and on the full-width comma (U+FF0C).
            // NOTE(review): the original separator list held the same comma twice; the
            // full-width form is assumed to be what was intended - confirm against real content.
            var arrHtml = htmlContent.Split(new char[] { ',', '\uFF0C' }, StringSplitOptions.RemoveEmptyEntries);

            var reg = new Regex(@"\d+\.\d*");

            foreach (var item in RateDic.RateNameDic)
            {
                foreach (var html in arrHtml)
                {
                    if (!html.Contains(item.Key))
                    {
                        continue;
                    }

                    var math = reg.Match(html);
                    if (!math.Success)
                    {
                        continue;
                    }

                    // The number always uses '.' as decimal separator, so parse it
                    // independently of the machine's culture.
                    decimal rate = Convert.ToDecimal(math.Value, System.Globalization.CultureInfo.InvariantCulture);

                    if (CNYtoFlags.Any(t => html.Contains(t)))
                    {
                        if (rate == 0M)
                        {
                            // Cannot invert a zero rate; look at the next segment instead of
                            // failing the whole export with a DivideByZeroException.
                            continue;
                        }
                        // Quoted as CNY -> foreign currency; convert to foreign currency -> CNY.
                        rate = MathHelper.Round6P(1 / rate);
                    }

                    if (html.Contains("100日元"))
                    {
                        // Quoted per 100 units; convert to a per-unit rate.
                        rate = MathHelper.Round6P(rate / 100);
                    }

                    exportRate.Add(new ChinaBankRateExport()
                    {
                        Des = item.Key,
                        EffectiveDate = dt,
                        From = item.Value,
                        To = "CNY",
                        Rate = rate,
                        Status = "有效",
                    });

                    // Only the first usable segment per currency is taken.
                    break;
                }
            }

            return exportRate;
        }


        /// <summary>
        /// Background-thread worker started by the export handler. For every entry in
        /// <c>monthList</c> that has no screenshot yet, runs
        /// <c>phantomjs.exe rasterize.js &lt;url&gt; &lt;png path&gt;</c> from the application directory
        /// and waits up to five seconds for it. Failures are traced, not thrown; the
        /// "show images" button is made visible at the end in every case.
        /// </summary>
        private void ScreenCapture()
        {
            Action AsynclbtnShowImg = delegate { btnShowImg.Visible = true; };
            string baseDir = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Export", "Capture");
            try
            {
                Directory.CreateDirectory(baseDir); // does nothing when the folder already exists

                // Work on a private copy: the field is owned by the UI thread.
                foreach (var item in monthList.ToList())
                {
                    var fullPath = Path.Combine(baseDir, item.Identifier + ".png");
                    if (File.Exists(fullPath))
                    {
                        continue;
                    }

                    using (var p = new System.Diagnostics.Process())
                    {
                        p.StartInfo.WindowStyle = System.Diagnostics.ProcessWindowStyle.Hidden;
                        p.StartInfo.FileName = "phantomjs.exe";
                        p.StartInfo.WorkingDirectory = AppDomain.CurrentDomain.BaseDirectory;
                        // Quote both values: the install path may contain spaces and the URL
                        // comes from crawled data.
                        p.StartInfo.Arguments = "rasterize.js " + QuoteProcessArgument(item.Href) + " " + QuoteProcessArgument(fullPath);
                        p.Start();
                        // A capture that takes longer is left running; disposing the Process
                        // object only releases the handle.
                        p.WaitForExit(5000);
                    }
                }
            }
            catch (Exception ex)
            {
                // Screenshots are best-effort; record the reason instead of dropping it.
                System.Diagnostics.Trace.WriteLine("ScreenCapture failed: " + ex);
            }

            lblExportResult.Invoke(AsynclbtnShowImg);
        }

        /// <summary>
        /// Wraps <paramref name="value"/> in double quotes for use in a process argument
        /// string, escaping embedded double quotes with a backslash. Values ending in a
        /// backslash are not handled specially.
        /// </summary>
        private static string QuoteProcessArgument(string value)
        {
            return "\"" + (value ?? string.Empty).Replace("\"", "\\\"") + "\"";
        }

        /// <summary>
        /// Click handler that opens the "Export\Capture" folder under the application's
        /// base directory via <c>Process.Start</c>.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void btnShowImg_Click(object sender, EventArgs e)
        {
            var appRoot = new DirectoryInfo(AppDomain.CurrentDomain.BaseDirectory);
            string captureFolder = appRoot.ToString() + "\\Export\\Capture";
            System.Diagnostics.Process.Start(captureFolder);
        }


    }



    #region 实体类
    /// <summary>
    /// One crawled list entry, stored as one row of the data workbook. All values are text.
    /// </summary>
    public class ChinaBankRateListItem
    {
        /// <summary>Date text of the entry; used as its key.</summary>
        public string Identifier { get; set; }

        /// <summary>Parent identifier: the number of the list page the entry was found on.</summary>
        public string PIdentifier { get; set; }

        /// <summary>Link text of the entry.</summary>
        public string Title { get; set; }

        /// <summary>"true" or "false": whether the detail content was fetched.</summary>
        public string IsSucess { get; set; }

        /// <summary>URL of the detail page.</summary>
        public string Href { get; set; }

        /// <summary>Text taken from the detail page; empty until it has been fetched.</summary>
        public string HtmlContent { get; set; }
    }

    /*
    HKD	港币
    IDR	印度尼西亚卢比
    INR	印度卢比
    USD	美元
    EUR	欧元
    GBP	英镑
    TWD	新台币
    CAD	加拿大元
    MXN	墨西哥比索
    AUD	澳大利亚元
    BRL	巴西雷阿尔
    KRW	韩国元
    MYR	马来西亚林吉特
    JPY	日元
    ZAR	南非兰特
    THB	泰国铢
    CHF	瑞士法郎
    SGD	新加坡元
    NZD	新西兰元
    PHP	菲律宾比索
    MOP	澳门元
    CNY	人民币
    NZD  新西兰元
    SGD 新加坡
    RUB 俄罗斯卢布
    KRW 韩元
         */

    /// <summary>
    /// Lookup from the Chinese currency name as it appears in the announcement text to the
    /// currency code. Author's note: matching was first done with regular expressions, but
    /// special characters and spaces caused problems, so it was replaced by string splitting
    /// plus "contains" checks.
    /// </summary>
    public static class RateDic
    {
        /// <summary>Currency name (key) to currency code (value), in insertion order.</summary>
        public static Dictionary<string, string> RateNameDic { set; get; }

        static RateDic()
        {
            RateNameDic = new Dictionary<string, string>
            {
                { "美元", "USD" },
                { "印度卢比", "INR" },
                { "欧元", "EUR" },
                { "日元", "JPY" },
                { "港元", "HKD" },
                { "英镑", "GBP" },
                { "澳大利亚元", "AUD" },
                { "新西兰元", "NZD" },
                { "新加坡元", "SGD" },
                { "瑞士法郎", "CHF" },
                { "加拿大元", "CAD" },
                { "俄罗斯卢布", "RUB" },
                { "林吉特", "MYR" },
                { "南非兰特", "ZAR" },
                { "韩元", "KRW" },
                // Currently disabled entries:
                //   阿联酋迪拉姆 AED, 沙特里亚尔 SAR, 匈牙利福林 HUF, 波兰兹罗提 PLN,
                //   丹麦克朗 DKK, 瑞典克朗 SEK, 挪威克朗 NOK, 土耳其里拉 TRY
                { "墨西哥比索", "MXN" },
            };
        }
    }

    /// <summary>
    /// One row of the exported workbook: a single exchange rate.
    /// </summary>
    public class ChinaBankRateExport
    {
        /// <summary>Source currency code.</summary>
        public string From { get; set; }

        /// <summary>Target currency code.</summary>
        public string To { get; set; }

        /// <summary>Exchange rate from <see cref="From"/> to <see cref="To"/>.</summary>
        public decimal Rate { get; set; }

        /// <summary>Date the rate applies to.</summary>
        public DateTime EffectiveDate { get; set; }

        /// <summary>Status text written to the export.</summary>
        public string Status { get; set; }

        /// <summary>Remark written to the export.</summary>
        public string Des { get; set; }
    }


    #endregion




}




1.这个网站不是直接通过ajax请求数据的;如果是那样,只要等待页面加载完成就可以抓取到数据。该网站的流程是:先通过返回的js生成cookie,带上该cookie访问动态地址,然后再次生成cookie,最后带上所有的cookie访问302跳转地址,才能得到结果。具体可参考 http://www.jianshu.com/p/11fac0596020

2.参考抓取获取cookies https://www.cnblogs.com/songxingzhu/p/7110723.html

3.获取里面的js变量 http://michaelthelin.se/javascript/testing/webdriver/2013/02/14/webdriver-reading-the-value-of-a-javascript-variable-spoiler-weirdness.html 


参考2:

1.这个网站处理办法如下:

1、进入搜索页面,得到js
2、htmlfile.write反混淆js,得到类似的两个函数function KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(str)和function QWERTASDFGXYSF()
3、运行这两个函数,得到两个cookie
        cookieString = "wzwstemplate=" + KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(template.toString()) + "; path=/";
        var confirm = QWERTASDFGXYSF();
        cookieString = "wzwschallenge=" + KTKY2RBD9NHPBCIHV9ZMEQQDARSLVFDU(confirm.toString()) + "; path=/";
4、根据dynamicurl中的地址,带着三个Cookie: wzwsconfirm=   wzwstemplate=  wzwschallenge= 
      得到Cookie ccpassport=,和302跳转
5、带着4个cookie,经过两次302跳转,就可以进入search页面,获得JSESSIONID,后面就好办了

这个网站比较复杂。


参考3:

1.http://www.cnblogs.com/endlock/p/6423613.html 使用Selenium来操作PhantomJS绝配

2.收费:https://www.nrecosite.com/phantomjs_wrapper_net.aspx

3.抓取中行 http://xusheng.org/blog/2016/10/19/ru-he-zhua-qu-diao-cha-tong-ji-si-de-shu-ju/ 

4.Webdriver: Reading the value of a Javascript variable (spoiler: weirdness):

 http://michaelthelin.se/javascript/testing/webdriver/2013/02/14/webdriver-reading-the-value-of-a-javascript-variable-spoiler-weirdness.html

5. Python小记:selenium+PhantomJS爬虫解决页面js添加cookie : https://www.jianshu.com/p/11fac0596020


猜你喜欢

转载自blog.csdn.net/paolei/article/details/78802726
今日推荐