处理父节点 子节点

using HtmlAgilityPack;
using Newtonsoft.Json;
using Newtonsoft.Json.Linq;
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;

namespace ReadZhihuThread
{
    public class ReadZhihu
    {
        /// <summary>
        /// Parses a topic-organize HTML page, builds the ancestor chain of every topic
        /// anchor found four list levels deep, and hands the chains to
        /// <c>WriteData(ref List&lt;JArray&gt;, string)</c> in batches.
        /// </summary>
        /// <param name="document">Raw HTML of the page.</param>
        /// <param name="outPath">Path of the file the chains are appended to.</param>
        public static void FormatDocument(string document, string outPath)
        {
            Console.WriteLine("Read document start");
            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(document);

            var singleNode = htmlDocument.DocumentNode.SelectSingleNode(".//div[@id=\"zh-topic-organize-page-children\"]");
            if (singleNode == null)
            {
                // HtmlAgilityPack returns null when the XPath matches nothing.
                Console.WriteLine("topic container not found");
                return;
            }

            // Topic anchors nested four <ul>/<li> levels below the container.
            var level4 = singleNode.SelectNodes(".//ul/li/ul/li/ul/li/ul/li/a[@name=\"topic\"]");
            if (level4 == null)
            {
                Console.WriteLine("no level 4 topics found");
                return;
            }

            List<JArray> list = new List<JArray>();
            int i = 0;
            foreach (var item in level4)
            {
                Console.WriteLine("select nodes: {0}", i++);

                Stack s = new Stack();
                GetParentNode(item, ref s);

                // A fresh array per anchor: sharing one array across iterations made
                // every list entry alias the same, ever-growing chain.
                // Popping yields the last pushed (outermost) ancestor first.
                list.Add(GenerateJArry(s));

                // Flush periodically so the pending batch stays small.
                if (list.Count > 100)
                {
                    WriteData(ref list, outPath);
                }
            }

            if (list.Count != 0)
            {
                WriteData(ref list, outPath);
            }
        }
        /// <summary>
        /// Pushes a topic object for <paramref name="node"/> onto <paramref name="s"/> and
        /// then keeps climbing: the next node is the first match of the XPath
        /// <c>.//a[1]</c> under the current node's great-grandparent.
        /// </summary>
        /// <remarks>
        /// The walk stops when a node has an empty <c>href</c> or empty inner HTML, when
        /// its topic equals the topic currently on top of the stack, or when one of the
        /// three ancestor levels (or the anchor lookup) yields nothing.
        /// </remarks>
        /// <param name="node">Anchor to start from.</param>
        /// <param name="s">Stack receiving one <c>JObject</c> per visited anchor.</param>
        public static void GetParentNode(HtmlNode node, ref Stack s)
        {
            HtmlNode current = node;
            while (true)
            {
                string href = current.GetAttributeValue("href", string.Empty);
                string title = current.InnerHtml;

                if (string.IsNullOrEmpty(href) || string.IsNullOrEmpty(title))
                {
                    return;
                }

                // Reaching the topic that was pushed last means the walk is going in circles.
                if (s.Count > 0 && ((JObject)s.Peek())["topic"].ToString() == title)
                {
                    return;
                }

                s.Push(GenereateObject(href, title));

                HtmlNode parent = current.ParentNode;
                if (parent == null)
                {
                    return;
                }

                HtmlNode grandParent = parent.ParentNode;
                if (grandParent == null)
                {
                    return;
                }

                HtmlNode greatGrandParent = grandParent.ParentNode;
                if (greatGrandParent == null)
                {
                    return;
                }

                HtmlNode next = greatGrandParent.SelectSingleNode(".//a[1]");
                if (next == null)
                {
                    return;
                }

                current = next;
            }
        }
        /// <summary>
        /// Builds the JSON object that represents one topic.
        /// </summary>
        /// <param name="url">Value stored under the <c>"url"</c> key.</param>
        /// <param name="topic">Value stored under the <c>"topic"</c> key.</param>
        /// <returns>A new object with the properties <c>topic</c> and <c>url</c>, in that order.</returns>
        public static JObject GenereateObject(string url, string topic)
        {
            return new JObject
            {
                { "topic", topic },
                { "url", url }
            };
        }

        /// <summary>
        /// Appends the first four elements of every array that has at least four
        /// elements to <paramref name="fileName"/> as one JSON line, skipping lines that
        /// are already present, and then clears <paramref name="list"/>.
        /// </summary>
        /// <param name="list">Pending chains; emptied before the method returns.</param>
        /// <param name="fileName">Output file; created when it does not exist.</param>
        public static void WriteData(ref List<JArray> list, string fileName)
        {
            Console.WriteLine("write data");

            // Load the existing lines once instead of re-reading the whole file for
            // every record. Each record is written as exactly one line, so a line
            // lookup replaces the former substring search over the file content.
            HashSet<string> writtenLines = new HashSet<string>();
            if (File.Exists(fileName))
            {
                foreach (string line in File.ReadAllLines(fileName))
                {
                    writtenLines.Add(line);
                }
            }

            // File.AppendText creates the file when it is missing. The previous
            // File.Create call left its FileStream open, which locked the new file
            // against the read and append that followed.
            using (StreamWriter sw = File.AppendText(fileName))
            {
                foreach (var item in list)
                {
                    if (item.Count < 4)
                    {
                        continue;
                    }

                    JArray outArray = new JArray();
                    for (int i = 0; i < 4; i++)
                    {
                        outArray.Add(item[i]);
                    }

                    string json = JsonConvert.SerializeObject(outArray);

                    // Add returns false when the line is already known.
                    if (writtenLines.Add(json))
                    {
                        sw.WriteLine(json);
                    }
                }
            }

            list.Clear();
        }

        /// <summary>
        /// Reads a topic-organize page that contains exactly two
        /// <c>div.zm-topic-manage-item-inner</c> sections, treats the first as the
        /// parent section and the second as the child section, and appends every
        /// parent chain combined with every remaining child chain to
        /// <paramref name="outputPath"/>, one JSON array per line.
        /// </summary>
        /// <param name="document">Raw HTML of the page.</param>
        /// <param name="outputPath">File the combined chains are appended to.</param>
        public static void ReadSubTopic(string document, string outputPath)
        {
            Console.WriteLine("read subject topic start");

            HtmlDocument htmlDocument = new HtmlDocument();
            htmlDocument.LoadHtml(document);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var parentChildNodes = htmlDocument.DocumentNode.SelectNodes(".//div[@class=\"zm-topic-manage-item-inner\"]");
            if (parentChildNodes == null || parentChildNodes.Count != 2)
            {
                Console.WriteLine("this code has a bug");
                return;
            }

            List<JArray> listParent = ReadParentChains(parentChildNodes[0]);

            // debug parent nodes
            // NOTE(review): hard-coded absolute path; this throws when the list is
            // non-empty and the D: drive does not exist.
            WriteData(listParent, @"D:\parentNode.json");

            List<string> childlist = ReadChildChains(parentChildNodes[1]);

            //distinct list
            Console.Write("**********************distinct list");
            RemoveCoveredChains(childlist);

            // join parent node data: every parent chain is extended by every child chain
            List<JArray> listAll = new List<JArray>();
            foreach (JArray item in listParent)
            {
                foreach (var childRecord in childlist)
                {
                    var arr = JArray.Parse(childRecord);

                    // Clone so that the parent chain itself is never modified.
                    JArray tempArray = (JArray)item.DeepClone();
                    foreach (JObject element in arr)
                    {
                        tempArray.Add(element);
                    }

                    Console.WriteLine("insert one record:{0}", JsonConvert.SerializeObject(tempArray));
                    listAll.Add(tempArray);
                }
            }

            //write data
            WriteData(listAll, outputPath);
        }

        /// <summary>
        /// Collects one topic chain per <c>ul</c> found directly under a
        /// <c>div.zm-topic-tree</c> inside <paramref name="parentNode"/>.
        /// </summary>
        private static List<JArray> ReadParentChains(HtmlNode parentNode)
        {
            List<JArray> listParent = new List<JArray>();

            var parentNodes = parentNode.SelectNodes(".//div[@class=\"zm-topic-tree\"]/ul");
            if (parentNodes == null)
            {
                return listParent;
            }

            int parentLevelCount = 0;
            foreach (var item in parentNodes)
            {
                Console.WriteLine("deal with parent {0} level data", ++parentLevelCount);
                JArray array = new JArray();

                // The chain starts at the first child of the list's first child.
                // A list without it yields an empty chain, which is also what
                // GetChildNode leaves behind when it rejects its first node.
                HtmlNode firstItem = item.FirstChild;
                HtmlNode firstAnchor = firstItem == null ? null : firstItem.FirstChild;
                if (firstAnchor != null)
                {
                    GetChildNode(firstAnchor, ref array);
                }

                listParent.Add(array);
            }

            return listParent;
        }

        /// <summary>
        /// Serializes the ancestor chain of every <c>li/a[@name="topic"]</c> anchor
        /// below <paramref name="childNode"/>.
        /// </summary>
        private static List<string> ReadChildChains(HtmlNode childNode)
        {
            //get all child nodes
            Console.WriteLine("get all child nodes");
            List<string> childlist = new List<string>();

            var nodes = childNode.SelectNodes(".//li/a[@name=\"topic\"]");
            if (nodes == null)
            {
                return childlist;
            }

            //get child topic structure
            int i = 0;
            foreach (var item in nodes)
            {
                Stack s = new Stack();

                Console.WriteLine("deal with {0} level", i++);
                GetParentNode(item, ref s);

                // genereate child structure
                Console.WriteLine("generate jarry {0} level", i);
                childlist.Add(JsonConvert.SerializeObject(GenerateJArry(s)));
            }

            return childlist;
        }

        /// <summary>
        /// Removes every serialized chain whose content (without the outer brackets)
        /// occurs inside another chain, and all but the first copy of identical chains.
        /// </summary>
        private static void RemoveCoveredChains(List<string> childlist)
        {
            // Compare against a snapshot so that removals do not shift the entries
            // still to be compared. Removals only happen at the current index while
            // walking downwards, so childlist[j] always equals snapshot[j].
            string[] snapshot = childlist.ToArray();

            for (int j = childlist.Count - 1; j > -1; j--)
            {
                Console.WriteLine("distinct {0} level data", j);
                string current = childlist[j];
                string inner = current.TrimStart('[').TrimEnd(']');

                for (int k = snapshot.Length - 1; k > -1; k--)
                {
                    if (j == k)
                    {
                        continue;
                    }

                    string other = snapshot[k];
                    bool covered;
                    if (other == current)
                    {
                        // Identical chains: keep the first copy. Comparing both ways
                        // against the snapshot used to remove every copy.
                        covered = k < j;
                    }
                    else
                    {
                        covered = other.Contains(inner);
                    }

                    if (covered)
                    {
                        childlist.RemoveAt(j);
                        break;
                    }
                }
            }
        }

        /// <summary>
        /// Appends every string in <paramref name="list"/> to
        /// <paramref name="outputPath"/> as its own line.
        /// </summary>
        /// <param name="list">Lines to append, in order.</param>
        /// <param name="outputPath">Target file; it is opened once per line.</param>
        public static void WriteData(List<string> list, string outputPath)
        {
            for (int index = 0; index < list.Count; index++)
            {
                File.AppendAllText(outputPath, list[index] + Environment.NewLine);
            }
        }

        /// <summary>
        /// Serializes every array in <paramref name="list"/> to JSON and appends it to
        /// <paramref name="outputPath"/> as its own line.
        /// </summary>
        /// <param name="list">Arrays to serialize, in order.</param>
        /// <param name="outputPath">Target file; it is opened once per array.</param>
        public static void WriteData(List<JArray> list, string outputPath)
        {
            for (int index = 0; index < list.Count; index++)
            {
                string line = JsonConvert.SerializeObject(list[index]);
                File.AppendAllText(outputPath, line + Environment.NewLine);
            }
        }
        /// <summary>
        /// Drains <paramref name="s"/> into a new array, so the element pushed last
        /// becomes the first array element.
        /// </summary>
        /// <param name="s">Stack to empty.</param>
        /// <returns>The popped elements in pop order.</returns>
        public static JArray GenerateJArry(Stack s)
        {
            JArray result = new JArray();
            for (int remaining = s.Count; remaining > 0; remaining--)
            {
                result.Add(s.Pop());
            }

            return result;
        }

        /// <summary>
        /// Appends a topic object for <paramref name="node"/> to
        /// <paramref name="array"/> and then keeps descending: the next node is the
        /// first child of the first child of the node two siblings after the current one.
        /// </summary>
        /// <remarks>
        /// The walk stops when a node has an empty <c>href</c> or empty inner HTML, or
        /// when any step of that sibling/child path is missing.
        /// </remarks>
        /// <param name="node">Anchor to start from.</param>
        /// <param name="array">Array receiving one object per visited anchor.</param>
        public static void GetChildNode(HtmlNode node, ref JArray array)
        {
            HtmlNode current = node;
            while (true)
            {
                string href = current.GetAttributeValue("href", string.Empty);
                string title = current.InnerHtml;

                if (string.IsNullOrEmpty(href) || string.IsNullOrEmpty(title))
                {
                    return;
                }

                array.Add(GenereateObject(href, title));

                HtmlNode firstSibling = current.NextSibling;
                if (firstSibling == null)
                {
                    return;
                }

                HtmlNode secondSibling = firstSibling.NextSibling;
                if (secondSibling == null)
                {
                    return;
                }

                HtmlNode firstItem = secondSibling.FirstChild;
                if (firstItem == null)
                {
                    return;
                }

                HtmlNode next = firstItem.FirstChild;
                if (next == null)
                {
                    return;
                }

                current = next;
            }
        }
    }
}

js

// ==UserScript==
// @name        知乎_话题_获取完整话题结构
// @namespace   zhihu
// @include     https://www.zhihu.com/topic/*/organize/entire
// @version     3
// @grant       none
// @description 知乎_话题_每隔1毫秒点击“加载更多”和“显示子话题”
// ==/UserScript==

// Number of times clickitem has run.
var count = 0;

/**
 * Clicks one element named "load": the one with the smallest offsetLeft; among
 * elements with the same offsetLeft an element whose text is "加载更多" replaces
 * a selected element whose text is "显示子话题".
 * Does nothing when the page has no such element.
 *
 * NOTE(review): this script declares `clickitem` a second time further down.
 * Function declarations are hoisted and the later one wins, so this version
 * only takes effect if that later declaration is removed.
 */
function clickitem() {
    var items = document.getElementsByName("load");
    var itemSel = null;

    for (var i = 0; i < items.length; i++) {
        var candidate = items[i];

        if (itemSel === null || candidate.offsetLeft < itemSel.offsetLeft) {
            itemSel = candidate;
            continue;
        }

        if (candidate.offsetLeft == itemSel.offsetLeft &&
            itemSel.text == "显示子话题" && candidate.text == "加载更多") {
            itemSel = candidate;
        }
    }

    count++;

    // Without this guard an empty page made the call throw on every tick.
    if (itemSel === null) {
        return;
    }
    itemSel.click();
}
// Handle of the timer that runs clickitem every 2 seconds.
var sss = setInterval(clickitem, 2000);

// Handle of the 1 ms timer managed by Start. It is declared here because Start
// reads it before anything assigns it, which used to raise a ReferenceError.
var s1;

/**
 * Runs every 20 seconds: clears the `s1` timer while `sss` holds a timer
 * handle, otherwise starts a 1 ms clickitem timer and stores it in `s1`.
 *
 * NOTE(review): `sss` is assigned above and never reset, so the else branch
 * looks unreachable; the condition was presumably meant to test `s1`. Confirm
 * the intent before changing it.
 */
function Start() {
    // `!= null` also covers undefined.
    if (sss != null) {
        window.clearInterval(s1);
    } else {
        s1 = setInterval(clickitem, 1);
    }
}

var sta = setInterval(Start, 20000);
 
 
 /**
  * Clicks the first element named "load"; does nothing when there is none.
  *
  * NOTE(review): this is the second declaration of `clickitem` in this script
  * and, being hoisted, it replaces the earlier one.
  */
 function clickitem() {
    var items = document.getElementsByName("load");

    // Without this guard an empty page made the call throw on every tick.
    if (items.length === 0) {
        return;
    }
    items[0].click();
 }

// NOTE(review): `sta` is already assigned earlier in this script; reassigning
// it here drops that earlier timer handle.
var sta = setInterval(clickitem, 1);

猜你喜欢

转载自www.cnblogs.com/skywss27/p/10009587.html