代码如下:
#!/usr/bin/env python # -*- coding: utf-8 -*- #2018/05/17 import os import requests import re import xml.sax import xml.dom.minidom as XmlDocument class MovieHandler(xml.sax.ContentHandler): def __init__(self): self.index = 0 self.KB = '' self.VulType = '' self.CurrentData = '' self.data = [] # 元素开始事件处理 def startElement(self, tag, attributes): if tag == 'Vul': self.index += 1 self.CurrentData = tag # 元素结束事件处理 def endElement(self, tag): # print self.KB,self.VulType if tag == 'Vul': print({'KB': self.KB[2:], 'Type': self.VulType}) self.data.append({'KB': self.KB[2:], 'Type': self.VulType}) return # 内容事件处理 def characters(self, content): if content == '': return if self.CurrentData == 'KB': self.KB = content elif self.CurrentData == 'VulType': self.VulType = content def get(self): return self.data # 解析xml文件,获取所有kb号 def get_dirname(): parser = xml.sax.make_parser() # turn off namepsaces parser.setFeature(xml.sax.handler.feature_namespaces, 0) # 重写 ContextHandler Handler = MovieHandler() parser.setContentHandler(Handler) parser.parse('TVL00001.tvl.xml') #需要解析的xml文件路径(全量库文件) data = Handler.get() return data # 输入系统的kb号,输出需要去重的数据 def get_patch(KB, Type): result = [] url = 'http://www.catalog.update.microsoft.com/Search.aspx?q=' + KB content = requests.get(url).content s = requests.session() s.keep_alive = False Microsoftservername = re.findall('for (.*) \(KB.*\)', content) # 输出系统名称 upid = re.findall('goToDetails\("(.*)"\)', content) for i in range(len(Microsoftservername)): get_new_kb = get_patch_info(upid[i]) if system_matching(Microsoftservername[i]) == None or get_new_kb == KB: continue result.append([KB, Type, system_matching(Microsoftservername[i]), get_new_kb]) return result # 输入非系统的kb号,输出需要去重的数据 def get_patch_2(KB, Type): result = [] url = 'http://www.catalog.update.microsoft.com/Search.aspx?q=' + KB content = requests.get(url).content s = requests.session() s.keep_alive = False upid = re.findall('goToDetails\("(.*)"\)', content) if len(upid) == 0: return None for i in range(len(upid)): get_new_kb = get_patch_info(upid[i]) if get_new_kb == KB: continue result.append([KB, Type, '', get_new_kb]) return result # 在第二个网页输入upid,获取最新补丁 def get_patch_info(id): url = 'http://www.catalog.update.microsoft.com/ScopedViewInline.aspx?updateid=' + id content = requests.get(url).content s = requests.session() s.keep_alive = False upid = re.findall("updateid=(.{36})", content) # 获取补丁对应的网址数据,有updateid的是data3 data2 = re.findall("\(KB(.{1,8})\)", content)# 获取到的第一个kb号为上一个输入的kb号 if len(data2) == 0: data2 = re.findall("/kb/(.*)\"", content) # 获取不到kb号,采用另外一种策略 if len(upid) == 1: return data2[0] else: return get_patch_info(upid[1]) # 有多个被替换的,然后输入第一个,返回的网页找不到upid # 系统名称匹配 def system_matching(Microsoftservername):#字典代表能够识别的系统添加匹配的系统 dict = {'Windows 7': 'enumSV_Windows_7', 'Windows 7 for x64-based Systems': 'enumSV_Windows_7_X64', 'Windows Server 2008 R2 for x64-based Systems': 'enumSV_Windows_2008_R2_X64', 'Windows Server 2008': 'enumSV_Windows_2008', 'Windows Vista': 'enumSV_WinVista', 'Windows Server 2008 for x64-based Systems': 'enumSV_Windows_2008_X64', 'Windows Vista for x64-based Systems': 'enumSV_WinVista_X64', 'Windows Server 2012 R2 for x64-based Systems': 'enumSV_Windows_2012_R2_X64', 'Windows 8.1 for x64-based Systems': 'enumSV_Windows_8_SP1_X64', 'Windows Server 2012 for x64-based Systems': 'enumSV_Windows_2012_X64', 'Windows Embedded 8 Standard for x64-based Systems': 'enumSV_Windows_8_X64', 'Windows 7 for x86-based Systems': 'enumSV_Windows_7', 'Windows 8.1 for x86-based Systems': 'enumSV_Windows_8_SP1', 'Windows 8.1': 'enumSV_Windows_8_SP1', 'Windows 8': 'enumSV_Windows_8', 'Windows 8 for x64-based Systems': 'enumSV_Windows_8_X64', 'Windows Server 2003 for x64-based Systems': 'enumSV_2003_X64', 'Windows XP for x64-based Systems': 'enumSV_WinXp_X64', 'Windows Server 2003': 'enumSV_2003', 'Windows XP': 'enumSV_WinXp', 'Windows Server 2012 R2': 'enumSV_Windows_2012_R2_X64', 'Windows Server 2008 R2 x64 Edition': 'enumSV_Windows_2008_R2_X64', 'Windows Server 2003 x64 Edition': 'enumSV_2003_X64', 'Windows Server 2008 x64 Edition': 'enumSV_Windows_2008_X64', 'Windows Server 2012': 'enumSV_Windows_2012_X64', 'Windows XP x64 Edition': 'enumSV_WinXp_X64', 'Server 2008 R2 for x64': 'enumSV_Windows_2008_R2_X64', 'Server 2008 x64': 'enumSV_Windows_2008_X64', 'Windows Vista for x64 based Systems': 'enumSV_WinVista_X64' } if Microsoftservername in dict: return dict[Microsoftservername] return None #将数据进行去重 def Duplicate_removal(Vuli): if len(Vuli) == 2 and Vuli[0] == Vuli[1]: return Vuli[0] index = 0 out_list = [] while len(Vuli): out_list.append([]) out_list[index].append(Vuli[0]) j = 1 if len(Vuli) == j: break while True: if Vuli[0][3] == Vuli[j][3]: out_list[index].append(Vuli.pop(j)) else: j += 1 if j == len(Vuli): Vuli.pop(0) break index += 1 return out_list #去重之后,将系统名称进行合并 def System_merging(Vulis): out = [] for sub_vuli in Vulis: data = [sub_vuli[0][0],sub_vuli[0][1],[],sub_vuli[0][3]] for i in sub_vuli: data[2].append(i[2]) out.append(data) return out # 生成xml格式的文件 # input ['1111','2',['win7','win8'],'22222'] def create_xml_files(doc, Vu): Vul = doc.createElement('Vul') VulList.appendChild(Vul) kbid = doc.createElement('kbid') Vul.appendChild(kbid) kbid.appendChild(doc.createTextNode(Vu[0])) VulType = doc.createElement('VulType') Vul.appendChild(VulType) VulType.appendChild(doc.createTextNode(Vu[1])) ByReplaceList = doc.createElement('ByReplaceList') Vul.appendChild(ByReplaceList) ByKbid = doc.createElement('ByKbid') ByReplaceList.appendChild(ByKbid) ByKbid.appendChild(doc.createTextNode(Vu[3])) SystemList = doc.createElement('SystemList') Vul.appendChild(SystemList) if Vu[2] == ['']: SystemList.appendChild(doc.createTextNode('\n' + '\t')) return 'system is None' # SystemList.appendChild(doc.createTextNode('')) for sub_system in Vu[2]: System = doc.createElement('System') SystemList.appendChild(System) System.appendChild(doc.createTextNode(sub_system)) if __name__ == '__main__': if os.path.exists('VulExpir.xml'): os.remove('VulExpir.xml') result = [] kb_number = get_dirname() news_kb_number = [] for kb in kb_number: if kb not in news_kb_number: news_kb_number.append(kb) # xml 根节点 doc = XmlDocument.Document() VulExpiration = doc.createElement('VulExpiration') doc.appendChild(VulExpiration) VulList = doc.createElement('VulList') VulExpiration.appendChild(VulList) # xml 循环写入 num = len(news_kb_number) n = 1 for i in news_kb_number: print '当前进度%d,总数%d' % (n, num) n+=1 if i['Type'] == '2': Vulist = get_patch_2(i['KB'], i['Type']) else: Vulist = get_patch(i['KB'], i['Type']) # 输出需要进行去重的多个值 if Vulist == None: continue Vulis = Duplicate_removal(Vulist) if len(Vulis) == 4 and isinstance(Vulis[0], unicode): create_xml_files(doc, Vulis) continue Vuli = System_merging(Vulis) for Vu in Vuli: create_xml_files(doc, Vu) with open('VulExpir.xml', 'w') as f: doc.writexml(f, addindent=' ', newl='\n', encoding='utf-16') f.close()