' Imports every .txt file that sits next to this workbook into Sheet1.
'
' Steps:
'   1. List the .txt file names into column A of the cache sheet (the second
'      sheet of this workbook) and sort them.
'   2. For each file, read it line by line into columns C (first 4 characters
'      of the line) and D (everything from the 6th character on).
'   3. Blank lines separate records: every run of non-blank rows gets a record
'      number in column B.
'   4. For each record, append a row to Sheet1: column A holds the record
'      number and columns B:P are filled by an array formula that looks the
'      values up in the cache sheet; the formulas are then replaced by values.
Sub InputText()
    ' Deliberate best-effort mode kept from the original macro: a failing cell
    ' write must not abort the whole import. Failures that would otherwise
    ' cause an endless loop (no files, file cannot be opened) are checked
    ' explicitly below.
    On Error Resume Next

    Dim FileAddress As String
    Dim FileNames As String
    Dim Rank As Long
    Dim RangeCount As Long
    Dim myText As String
    Dim RangeCountText As Long
    Dim RangeEndRow As Long
    Dim RangeEndRow_1 As Long
    Dim TargetRow As Long
    Dim StartNum As Long
    Dim EndNum As Long
    Dim FileNo As Integer
    Dim i As Long
    Dim j As Long
    Dim k As Long
    Dim CacheSheet As Worksheet

    Set CacheSheet = ThisWorkbook.Sheets(2)

    FileAddress = ThisWorkbook.Path & "\*.txt"   ' search pattern for the txt files
    FileNames = Dir(FileAddress, vbNormal)       ' first matching file name
    Rank = 1                                     ' file names are listed from row 1

    Sheet1.Range("A2:P60000").ClearContents      ' clear previously imported data

    With CacheSheet
        .Range("A:A").ClearContents              ' clear the old file-name list

        ' Nothing to import: leave now instead of trying to open a
        ' non-existent file further down.
        If FileNames = "" Then Exit Sub

        ' Write every txt file name into column A of the cache sheet.
        Do While FileNames <> ""
            .Range("A" & Rank) = FileNames
            FileNames = Dir                      ' next match
            Rank = Rank + 1
        Loop

        RangeCount = .Range("A10000").End(xlUp).Row   ' number of listed files

        ' Sort the file names (pinyin order). The ranges are qualified with
        ' the cache sheet so the sort works no matter which sheet is active.
        With .Sort
            .SortFields.Clear
            .SortFields.Add Key:=CacheSheet.Range("A1"), SortOn:=xlSortOnValues, Order:=xlAscending, DataOption:=xlSortNormal
            .SetRange CacheSheet.Range("A1:A" & RangeCount)
            .Header = xlNo
            .MatchCase = False
            .Orientation = xlTopToBottom
            .SortMethod = xlPinYin
            .Apply
        End With

        For i = 1 To RangeCount
            ' Clear the cache columns before loading the next file.
            .Range("B:D").ClearContents

            FileNo = FreeFile                    ' do not assume #1 is free
            Err.Clear
            Open ThisWorkbook.Path & "\" & .Range("A" & i) For Input As #FileNo

            ' Only process the file if it was really opened; otherwise the
            ' EOF loop below could never terminate under Resume Next.
            If Err.Number = 0 Then
                ' Read the file: C = first 4 characters, D = text from the
                ' 6th character on.
                j = 1
                Do While Not EOF(FileNo)
                    Line Input #FileNo, myText
                    .Range("C" & j) = Left(myText, 4)
                    .Range("D" & j) = Mid(myText, 6)
                    j = j + 1
                Loop
                Close #FileNo

                RangeCountText = .Range("C10000").End(xlUp).Row   ' last loaded row

                ' Column E collects the record boundaries: 0, the row number
                ' of every blank row in C, and finally the last loaded row.
                .Range("E:E").ClearContents
                .Range("E1") = 0
                k = 2
                For j = 1 To RangeCountText
                    If .Range("C" & j) = "" Then
                        .Range("E" & k) = .Range("C" & j).Row
                        k = k + 1
                    End If
                Next
                RangeEndRow = .Range("E10000").End(xlUp).Row + 1
                .Range("E" & RangeEndRow) = RangeCountText

                ' Number the rows between two neighbouring boundaries with the
                ' same record number in column B.
                k = 1
                For j = 1 To RangeEndRow - 1
                    StartNum = .Range("E" & j)
                    EndNum = .Range("E" & j + 1)
                    .Range("B" & StartNum + 1 & ":B" & EndNum) = k
                    k = k + 1
                Next

                ' Append one row per record to Sheet1.
                For j = 1 To k - 1
                    With Sheet1
                        RangeEndRow_1 = .Range("A60000").End(xlUp).Row   ' last used row in column A
                        TargetRow = RangeEndRow_1 + 1
                        .Range("A" & TargetRow) = j                      ' record number
                        With .Range("B" & TargetRow)
                            ' Array formula: look up (record number & header
                            ' text) in the cache sheet and return column 4.
                            .FormulaArray = "=IFERROR(INDEX(缓存区!C4,MATCH(RC1&R1C,缓存区!C2&缓存区!C3,),1)&"""","""")"
                            .AutoFill Sheet1.Range("B" & TargetRow & ":P" & TargetRow), xlFillDefault
                        End With
                        ' Freeze the formulas of the new row into plain values.
                        With .Range("B" & TargetRow & ":P" & TargetRow)
                            .Copy
                            .PasteSpecial Paste:=xlPasteValues, Operation:=xlNone, SkipBlanks:=False, Transpose:=False
                        End With
                    End With
                Next
            End If
            Err.Clear
        Next
    End With

    Application.CutCopyMode = False              ' drop the leftover copy marquee
End Sub
#!/usr/bin/python
#-*-coding:UTF-8-*-
import urllib,re,sys,time
UrlMain='http://cs.58.com/csyuhua/zufang/0/j2/?ispic=1&selpic=2'
MainHtml=urllib.urlopen(UrlMain).read()\
.replace(' ','')\
.replace('\n','')
ReUrlAreaContent=re.compile('<div class="arealist">(.*?)</div></dd></dl>')
UrlAreaContent=ReUrlAreaContent.findall(MainHtml)[0]
ReUrlArea=re.compile('<a href=(.*?)</a>')
UrlArea=ReUrlArea.findall(UrlAreaContent)
for i in range(len(UrlArea)):
#每个地区的网址#
UrlAreaResult='http://cs.58.com'+UrlArea[i].split('\"')[1]
AreaResultHtml=urllib.urlopen(UrlAreaResult).read()\
.replace(' ','')\
.replace('\n','')
#获取住房信息网址#
ReUrlHouse=re.compile('<h2><a href=(.*?)</a>')
UrlHouse=ReUrlHouse.findall(AreaResultHtml)
for j in range(len(UrlHouse)):
try:
#当前页每个住房信息的网址
UrlHouseResult=UrlHouse[j].split('\"')[1]
HouseResultHtml=urllib.urlopen(UrlHouseResult).read()\
.replace(' ','')\
.replace('\n','')\
.replace('\r','')
#获取当前网页名称
ReHouse_Title=re.compile('<title>(.*?)</title>')
House_Title='网页名称:'+ReHouse_Title.findall(HouseResultHtml)[0]
if House_Title<>'网页名称:403 Forbidden' or House_Title<>'网页名称:请输入验证码':
#获取房子租金
ReHouse_Money=re.compile('<span class="c_ff552e">(.*?)</span><!')
House_Money=ReHouse_Money.findall(HouseResultHtml)[0]\
.replace('<bclass="f36">','')\
.replace('</b>','')\
.replace('</span> <spanclass="c_333">',' ')
#获取租赁方式
ReHouse_Pay=re.compile('<li><spanclass="c_888 mr_15">(.*?)</span></li>')
House_Pay=ReHouse_Pay.findall(HouseResultHtml)[0]\
.replace(':</span><span>',':')
#获取房子信息
ReHouse_Info=re.compile('<li><span class="c_888 mr_15">(.*?)</li>')
House_Info=ReHouse_Info.findall(HouseResultHtml)
House_Infos=''
for k in range(len(House_Info)):
House_text=House_Info[k]\
.replace(':</span><span>',':')\
.replace(' ',' ')\
.replace(' </span>','')\
.replace('</span>','')
if k==2:
House_Infos+=House_text.split('<')[0]\
+House_text.split('>')[1].replace('</a','')+'\n'
elif k==3:
House_Infos+=House_text.split('<')[0]\
+House_text.split('>')[1].replace('</a',' ')\
+House_text.replace('</a>','').split('>')[2].replace('<emclass="dt c_888 f12"','')+'\n'
elif k==4:
House_Infos+=House_text.split('<')[0]\
+House_text.split('>')[1].replace('</a',' ')\
+House_text.replace('</a>','').split('>')[2]+'\n'
else:
House_Infos+=House_text+'\n'
#获取房子详细地址
ReHouse_Address=re.compile('<spanclass="c_888 mr_15">(.*?)</span></li>')
House_text=ReHouse_Address.findall(HouseResultHtml)[1]\
.replace(':</span>',':')
House_Address=House_text.split('<')[0]\
+House_text.split('>')[1].replace('</span','')
#获取房东信息
ReHouse_People=re.compile('<p class="agent-name f16 pr">(.*?)</i></p>')
House_text=ReHouse_People.findall(HouseResultHtml)[0].split('>')
for k in range(len(House_text)):
House_People=House_text[1].replace('</a','')\
+House_text[2]
if k>5:
House_People=House_People+House_text[4]+House_text[6]
elif k>3:
House_People=House_People+House_text[4]
else:
House_People=House_People
House_People='房东信息:'+House_People\
.replace('<i class="icon pho-approve" title="',' ')\
.replace('"<i class="icon mail-approve" title="',' ')\
.replace('"<i class="icon single-approve" title="',' ')\
.replace('\"','')
#获取房屋配置
ReHouse_Disposal=re.compile('<ul class="house(.*?)</ul>')
House_Disposal=ReHouse_Disposal.findall(HouseResultHtml)[0]
ReHouse_Allocation=re.compile('<li class="(.*?)</li>')
House_Allocation=ReHouse_Allocation.findall(House_Disposal)
House_text=''
if len(House_Allocation)>1:
for k in range(len(House_Allocation)):
House_text+='、'+House_Allocation[k].split('</i>')[1]
House_Allocation='房屋配置:'+House_text.replace('、','',1)
else:
House_Allocation='房屋配置:'
#获取房屋其他信息
ReHouse_Item=re.compile("<ul class='introduce-item'>(.*?)</ul>")
House_Item=ReHouse_Item.findall(HouseResultHtml)[0]
ReHouse_OtherInfo=re.compile("<li><span class='a1'>(.*?)</span></li>")
House_OtherInfo=ReHouse_OtherInfo.findall(HouseResultHtml)
House_text=''
for k in range(len(House_OtherInfo)):
House_text+=House_OtherInfo[k]+'\n'
House_OtherInfo=House_text\
.replace("</span><span class='a2'>",':')\
.replace('</em><em>','、')\
.replace('<em>','')\
.replace('</em>','')\
.replace('<p>','')\
.replace('<p >','')\
.replace('</p>','')\
.replace('<b>','')\
.replace('</b>','')\
.replace('<br>','')\
.replace('<br >','')\
.replace('<br />','')\
.replace(' ','')\
.replace('<span >','')\
.replace('<span>','')\
.replace('</span>','')\
.replace('<strong>','')\
.replace('<strong >','')\
.replace('</strong>','')
House_InfoText='网址链接:'+UrlHouseResult+'\n'\
+House_Title+'\n'\
+'房子租金:'+House_Money+'\n'\
+House_Pay+'\n'\
+House_Infos\
+House_Address+'\n'\
+House_People+'\n'\
+House_Allocation+'\n'\
+House_OtherInfo
FileName=UrlArea[i].split('"')[4].replace('>','').decode('utf8').encode('gb2312')+'.txt'
FileSave=open(sys.path[0]+'/'+FileName,'a')
FileContent=House_InfoText+'\n'
FileSave.write(FileContent.decode('utf8').encode('gb2312'))
FileSave.close
print House_InfoText+'\n'+str(i)
else:
print '\n'
except:
print '检查网络情况'
time.sleep(10)
next