需求:采集携程网酒店信息
步骤:
- 使用火车头编写采集规则并保存到txt文件
- 编写php脚本读取txt文件并按照规则保存到csv
代码:
<?php
/**
 * Read the scraped hotel list from a text file and store it in a CSV table.
 *
 * Every non-blank input line is split into fields on the ",," separator by
 * filter_my(); put_csv_my() then writes the first five fields of each line
 * as one CSV row, converted to GB2312.
 */

$file_name = 'hotel_2018-3-8.txt';
$data = [];

// Check readability up front: the read loop below cannot work on a failed handle.
$file = is_readable($file_name) ? fopen($file_name, 'r') : false;

if ($file === false) {
    // No exit() here, so the helper functions stay usable when this file is included.
    echo 'ERROR: cannot read ' . $file_name;
} else {
    // Loop on the fgets() result itself. Testing feof() before reading passes a
    // final `false` to filter_my() and produces a phantom empty row.
    while (($str = fgets($file)) !== false) {
        if (trim($str) === '') {
            continue; // blank lines carry no hotel record
        }
        $data[] = filter_my($str);
    }
    fclose($file);

    echo put_csv_my($data) ? 'OK' : 'ERROR: cannot write the CSV file';
}

/**
 * Clean one raw input line and split it into its fields.
 *
 * @param string $str One line of the scraped text file (a trailing line break is allowed).
 * @return array List of field strings obtained by splitting on ",,".
 */
function filter_my($str)
{
    // Put your own filtering logic here.

    // fgets() keeps the line terminator; without this the last field would
    // carry an embedded newline into the CSV.
    $str = rtrim((string) $str, "\r\n");

    // Replacements are applied in order: spaces, then "!!", then "[]".
    // NOTE(review): the original removed ' ' a second time after "[]"; as written
    // that was a no-op. If it was meant to be a non-breaking or full-width space,
    // add that character to this list.
    $str = str_replace([' ', '!!', '[]'], '', $str);

    return explode(',,', $str);
}

/**
 * Write the hotel rows to a CSV file.
 *
 * The file starts with a fixed header row. Only the first five fields of each
 * row are written; missing fields become empty cells. All text is converted
 * from UTF-8 to GB2312.
 *
 * @param array  $dataList List of rows, each a list of field strings.
 * @param string $csv_file Path of the CSV file to create or overwrite.
 * @return bool True on success, false when the file cannot be opened for writing.
 */
function put_csv_my($dataList, $csv_file = 'hotel_2018-3-8.csv')
{
    // Put your own output logic here.
    $fp = fopen($csv_file, 'w');
    if ($fp === false) {
        return false;
    }

    $header = ['酒店名称', '地址', '房间数', '开业时间', '联系电话'];
    fputcsv($fp, array_map('to_gb2312_my', $header));

    $column_count = count($header);
    foreach ($dataList as $data) {
        $row = [];
        for ($i = 0; $i < $column_count; $i++) {
            $row[] = to_gb2312_my(isset($data[$i]) ? $data[$i] : '');
        }
        fputcsv($fp, $row);
    }

    fclose($fp);

    return true;
}

/**
 * Convert UTF-8 text to GB2312, dropping characters GB2312 cannot represent.
 *
 * @param string $text UTF-8 encoded text.
 * @return string GB2312 encoded text, or an empty string when the conversion fails.
 */
function to_gb2312_my($text)
{
    $converted = iconv('UTF-8', 'GB2312//IGNORE', (string) $text);

    // iconv() returns false on failure; make the empty-cell fallback explicit.
    return $converted === false ? '' : $converted;
}