VBTEST

    ' InputText
    ' Imports every *.txt file located in this workbook's folder into Sheet1.
    '
    ' Steps:
    '   1. Clear Sheet1!A2:P60000 and list the txt file names in column A of the
    '      second worksheet (used as a scratch/cache sheet).
    '   2. Sort that list ascending (PinYin sort method).
    '   3. For each file: load its lines into cache columns C (first 4 characters)
    '      and D (text from character 6 on), number the blank-line-separated
    '      groups in column B, then append one row per group to Sheet1 by means
    '      of an array formula that is immediately converted to values.
    '
    ' NOTE(review): errors are still suppressed with On Error Resume Next, as in
    ' the original macro, so a failure on one cell/file does not abort the import.
    ' The places where suppression used to cause wrong behaviour (sort on the
    ' wrong sheet, endless read loop after a failed Open) are handled explicitly.
    Sub InputText()
        On Error Resume Next

        Dim FileAddress As String
        Dim FileNames As String
        Dim myText As String
        Dim wsCache As Worksheet
        Dim FileNum As Integer
        ' Row counters are Long: worksheet row numbers can exceed the Integer range.
        Dim Rank As Long
        Dim RangeCount As Long
        Dim RangeCountText As Long
        Dim RangeEndRow As Long
        Dim NextRow As Long
        Dim StartNum As Long
        Dim EndNum As Long
        Dim i As Long
        Dim j As Long
        Dim k As Long

        Set wsCache = ThisWorkbook.Sheets(2)
        FileAddress = ThisWorkbook.Path & "\*.txt" ' search pattern for the txt files
        FileNames = Dir(FileAddress, vbNormal) ' first matching file name
        Rank = 1 ' file names are written starting at row 1
        Sheet1.Range("A2:P60000").ClearContents ' clear the previously imported data

        With wsCache
            .Range("A:A").ClearContents ' clear the file-name list
            Do While FileNames <> "" ' loop until Dir returns no more files
                .Range("A" & Rank) = FileNames
                FileNames = Dir ' next matching file
                Rank = Rank + 1
            Loop

            ' Nothing to import: leave before trying to open a non-existent file.
            If Rank = 1 Then Exit Sub

            RangeCount = .Range("A10000").End(xlUp).Row ' number of listed files

            With .Sort ' sort the file names in column A
                .SortFields.Clear
                ' The key and range must belong to the sheet that owns this Sort
                ' object; an unqualified Range() would refer to the active sheet.
                .SortFields.Add Key:=wsCache.Range("A1"), SortOn:=xlSortOnValues, Order:=xlAscending, DataOption:=xlSortNormal
                .SetRange wsCache.Range("A1:A" & RangeCount)
                .Header = xlNo
                .MatchCase = False
                .Orientation = xlTopToBottom
                .SortMethod = xlPinYin
                .Apply
            End With

            For i = 1 To RangeCount
                .Range("B:D").ClearContents ' clear the previous file's content

                Err.Clear
                FileNum = FreeFile ' do not assume file number 1 is available
                Open ThisWorkbook.Path & "\" & .Range("A" & i) For Input As #FileNum

                ' Skip this file if it could not be opened; otherwise EOF() would
                ' keep failing and the read loop below would never terminate.
                If Err.Number = 0 Then
                    j = 1
                    Do While Not EOF(FileNum) ' copy every line into columns C and D
                        Line Input #FileNum, myText
                        .Range("C" & j) = Left(myText, 4)
                        .Range("D" & j) = Mid(myText, 6)
                        j = j + 1
                    Loop
                    Close #FileNum

                    RangeCountText = .Range("C10000").End(xlUp).Row ' last used row in column C

                    ' Column E receives the group boundaries: 0, the row number of
                    ' every blank cell in column C, and finally the last used row.
                    .Range("E:E").ClearContents
                    .Range("E1") = 0
                    k = 2
                    For j = 1 To RangeCountText
                        If .Range("C" & j) = "" Then
                            .Range("E" & k) = .Range("C" & j).Row
                            k = k + 1
                        End If
                    Next
                    RangeEndRow = .Range("E10000").End(xlUp).Row + 1
                    .Range("E" & RangeEndRow) = RangeCountText

                    ' Give every row between two consecutive boundaries the same
                    ' sequence number in column B.
                    k = 1
                    For j = 1 To RangeEndRow - 1
                        StartNum = .Range("E" & j)
                        EndNum = .Range("E" & j + 1)
                        .Range("B" & StartNum + 1 & ":B" & EndNum) = k
                        k = k + 1
                    Next

                    ' Append one row per group to Sheet1.
                    For j = 1 To k - 1
                        With Sheet1
                            NextRow = .Range("A60000").End(xlUp).Row + 1 ' first free row in column A
                            .Range("A" & NextRow) = j ' sequence number of the group
                            ' Array formula: returns the cache column 4 value whose
                            ' (column 2 & column 3) equals this row's column A value
                            ' joined with the row-1 header of the current column.
                            .Range("B" & NextRow).FormulaArray = "=IFERROR(INDEX(缓存区!C4,MATCH(RC1&R1C,缓存区!C2&缓存区!C3,),1)&"""","""")"
                            .Range("B" & NextRow).AutoFill .Range("B" & NextRow & ":P" & NextRow), xlFillDefault
                            ' Replace the formulas with their values. Copy/PasteSpecial
                            ' is kept so text results stay text.
                            With .Range("B" & NextRow & ":P" & NextRow)
                                .Copy
                                .PasteSpecial Paste:=xlPasteValues, Operation:=xlNone, SkipBlanks:=False, Transpose:=False
                            End With
                        End With
                    Next
                End If
            Next
        End With

        Application.CutCopyMode = False ' leave copy mode after the last PasteSpecial
    End Sub
#!/usr/bin/python
#-*-coding:UTF-8-*-
import urllib,re,sys,time
# Python 2 script: crawls the rental listings reachable from UrlMain, area by
# area, extracts the details of every listing page with regular expressions and
# appends them, GB2312-encoded, to one "<area name>.txt" file per area in the
# script's directory (sys.path[0]).
UrlMain='http://cs.58.com/csyuhua/zufang/0/j2/?ispic=1&selpic=2'

# Page titles that mean the site refused the request instead of serving a listing.
BlockedTitles=('网页名称:403 Forbidden','网页名称:请输入验证码')

# All patterns are compiled once here instead of on every loop iteration.
ReUrlAreaContent=re.compile('<div class="arealist">(.*?)</div></dd></dl>')
ReUrlArea=re.compile('<a href=(.*?)</a>')
ReUrlHouse=re.compile('<h2><a href=(.*?)</a>')
ReHouse_Title=re.compile('<title>(.*?)</title>')
ReHouse_Money=re.compile('<span class="c_ff552e">(.*?)</span><!')
ReHouse_Pay=re.compile('<li><spanclass="c_888 mr_15">(.*?)</span></li>')
ReHouse_Info=re.compile('<li><span class="c_888 mr_15">(.*?)</li>')
ReHouse_Address=re.compile('<spanclass="c_888 mr_15">(.*?)</span></li>')
ReHouse_People=re.compile('<p class="agent-name f16 pr">(.*?)</i></p>')
ReHouse_Disposal=re.compile('<ul class="house(.*?)</ul>')
ReHouse_Allocation=re.compile('<li class="(.*?)</li>')
ReHouse_Item=re.compile("<ul class='introduce-item'>(.*?)</ul>")
ReHouse_OtherInfo=re.compile("<li><span class='a1'>(.*?)</span></li>")

# Fetch the entry page; double spaces and newlines are stripped so the
# patterns above can match across what were separate lines.
MainHtml=urllib.urlopen(UrlMain).read()\
          .replace('  ','')\
          .replace('\n','')
UrlAreaContent=ReUrlAreaContent.findall(MainHtml)[0]
UrlArea=ReUrlArea.findall(UrlAreaContent)
for i,AreaLink in enumerate(UrlArea):
    # URL of the listing index for this area.
    UrlAreaResult='http://cs.58.com'+AreaLink.split('\"')[1]
    AreaResultHtml=urllib.urlopen(UrlAreaResult).read()\
                    .replace('  ','')\
                    .replace('\n','')
    # Links to the individual listing pages of this area.
    UrlHouse=ReUrlHouse.findall(AreaResultHtml)
    for HouseLink in UrlHouse:
        try:
            # URL of one listing page.
            UrlHouseResult=HouseLink.split('\"')[1]
            HouseResultHtml=urllib.urlopen(UrlHouseResult).read()\
                             .replace('  ','')\
                             .replace('\n','')\
                             .replace('\r','')
            # Page title.
            House_Title='网页名称:'+ReHouse_Title.findall(HouseResultHtml)[0]
            # Skip pages that are a block/captcha notice. The original test
            # joined two "<>" comparisons with "or", which is always true.
            if House_Title not in BlockedTitles:
                # Rent.
                House_Money=ReHouse_Money.findall(HouseResultHtml)[0]\
                             .replace('<bclass="f36">','')\
                             .replace('</b>','')\
                             .replace('</span>    <spanclass="c_333">',' ')
                # Payment / lease terms.
                House_Pay=ReHouse_Pay.findall(HouseResultHtml)[0]\
                           .replace(':</span><span>',':')
                # General listing attributes, one per line. Entries 2, 3 and 4
                # contain links, so their text is reassembled from the pieces
                # around the tags.
                House_Infos=''
                for k,RawInfo in enumerate(ReHouse_Info.findall(HouseResultHtml)):
                    House_text=RawInfo\
                                      .replace(':</span><span>',':')\
                                      .replace('  ',' ')\
                                      .replace('  </span>','')\
                                      .replace('</span>','')
                    if k==2:
                        House_Infos+=House_text.split('<')[0]\
                                      +House_text.split('>')[1].replace('</a','')+'\n'
                    elif k==3:
                        House_Infos+=House_text.split('<')[0]\
                                      +House_text.split('>')[1].replace('</a',' ')\
                                      +House_text.replace('</a>','').split('>')[2].replace('<emclass="dt c_888 f12"','')+'\n'
                    elif k==4:
                        House_Infos+=House_text.split('<')[0]\
                                      +House_text.split('>')[1].replace('</a',' ')\
                                      +House_text.replace('</a>','').split('>')[2]+'\n'
                    else:
                        House_Infos+=House_text+'\n'
                # Detailed address (second match of the pattern).
                House_text=ReHouse_Address.findall(HouseResultHtml)[1]\
                               .replace(':</span>',':')
                House_Address=House_text.split('<')[0]\
                               +House_text.split('>')[1].replace('</span','')
                # Landlord information. The original looped over every fragment
                # but rebuilt the value from scratch each time, so only the last
                # index mattered; it is computed once here with the same result.
                House_text=ReHouse_People.findall(HouseResultHtml)[0].split('>')
                House_People=House_text[1].replace('</a','')\
                              +House_text[2]
                LastIndex=len(House_text)-1
                if LastIndex>5:
                    House_People=House_People+House_text[4]+House_text[6]
                elif LastIndex>3:
                    House_People=House_People+House_text[4]
                House_People='房东信息:'+House_People\
                              .replace('<i class="icon pho-approve" title="',' ')\
                              .replace('"<i class="icon mail-approve" title="',' ')\
                              .replace('"<i class="icon single-approve" title="',' ')\
                              .replace('\"','')
                # Furnishings / amenities.
                House_Disposal=ReHouse_Disposal.findall(HouseResultHtml)[0]
                House_Allocation=ReHouse_Allocation.findall(House_Disposal)
                if len(House_Allocation)>1:
                    House_Allocation='房屋配置:'+'、'.join(
                        [Item.split('</i>')[1] for Item in House_Allocation])
                else:
                    House_Allocation='房屋配置:'
                # Other information.
                # NOTE(review): House_Item is not used afterwards, but the [0]
                # lookup makes a page without this section raise and be skipped,
                # so it is kept to preserve that behaviour.
                House_Item=ReHouse_Item.findall(HouseResultHtml)[0]
                House_OtherInfo=ReHouse_OtherInfo.findall(HouseResultHtml)
                House_text=''.join([Other+'\n' for Other in House_OtherInfo])
                House_OtherInfo=House_text\
                                 .replace("</span><span class='a2'>",':')\
                                 .replace('</em><em>','、')\
                                 .replace('<em>','')\
                                 .replace('</em>','')\
                                 .replace('<p>','')\
                                 .replace('<p >','')\
                                 .replace('</p>','')\
                                 .replace('<b>','')\
                                 .replace('</b>','')\
                                 .replace('<br>','')\
                                 .replace('<br >','')\
                                 .replace('<br />','')\
                                 .replace(' ','')\
                                 .replace('<span >','')\
                                 .replace('<span>','')\
                                 .replace('</span>','')\
                                 .replace('<strong>','')\
                                 .replace('<strong >','')\
                                 .replace('</strong>','')
                House_InfoText='网址链接:'+UrlHouseResult+'\n'\
                                +House_Title+'\n'\
                                +'房子租金:'+House_Money+'\n'\
                                +House_Pay+'\n'\
                                +House_Infos\
                                +House_Address+'\n'\
                                +House_People+'\n'\
                                +House_Allocation+'\n'\
                                +House_OtherInfo
                # One output file per area, named after the area link text.
                FileName=AreaLink.split('"')[4].replace('>','').decode('utf8').encode('gb2312')+'.txt'
                FileContent=House_InfoText+'\n'
                # "with" guarantees the file is closed; the original referenced
                # FileSave.close without calling it.
                with open(sys.path[0]+'/'+FileName,'a') as FileSave:
                    FileSave.write(FileContent.decode('utf8').encode('gb2312'))
                print(House_InfoText+'\n'+str(i))
            else:
                print('\n')
        except Exception:
            # Any failure on this listing (network error, missing page section,
            # encoding error) is reported with the same message; pause, then move
            # on to the next listing. Catching Exception rather than everything
            # lets Ctrl-C stop the crawl.
            print('检查网络情况')
            time.sleep(10)
            continue

猜你喜欢

转载自blog.csdn.net/qq_18301257/article/details/79341465