Java: Using URL to Open Resource Connections and Crawl Images from the Web

Core APIs

  • InputStream
  • FileOutputStream
  • URL
  • Pattern
  • Matcher
  1. Use URL's openStream to obtain an input stream for the network resource and hand it to an InputStream.
  2. Use Pattern's compile to specify the format of the resources we need.
  3. Use a Matcher to match everything we fetched against the compiled pattern.
  4. For each matched resource link, build a URL again and obtain its InputStream.
  5. Finally, write the data from the InputStream into a file on disk through a FileOutputStream (a compact sketch of these five steps follows this list).
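
Taken together, the five steps boil down to the sketch below. It is only a minimal illustration: the page address, the ".jpg" pattern, and the output file names are placeholder assumptions, and try-with-resources is used so the streams close themselves.

import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class CrawlSketch {

    public static void main(String[] args) throws IOException {
        // Step 1: open an input stream to the page (the URL here is a placeholder)
        String page;
        try (InputStream in = new URL("https://example.com/gallery").openStream();
             ByteArrayOutputStream out = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                out.write(buffer, 0, len);
            }
            // Decode the whole page once so multi-byte UTF-8 characters are not split
            page = out.toString(StandardCharsets.UTF_8.name());
        }

        // Step 2: compile a pattern describing the resources we want (here: .jpg links)
        Pattern pattern = Pattern.compile("https://.*?\\.jpg");
        // Step 3: run the matcher over the downloaded page text
        Matcher matcher = pattern.matcher(page);

        int count = 0;
        while (matcher.find()) {
            String imageUrl = matcher.group();
            // Step 4: open an input stream to each matched image link
            // Step 5: copy its bytes into a local file through a FileOutputStream
            try (InputStream imageIn = new URL(imageUrl).openStream();
                 FileOutputStream fileOut = new FileOutputStream("image-" + (count++) + ".jpg")) {
                byte[] buffer = new byte[1024];
                int len;
                while ((len = imageIn.read(buffer)) != -1) {
                    fileOut.write(buffer, 0, len);
                }
            }
        }
    }
}

Collecting the raw bytes and decoding them once avoids corrupting multi-byte UTF-8 characters that would otherwise be split across read calls; the full implementation below takes the same approach.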

Implementation

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Created with IntelliJ IDEA.
 * Description: If you don't work hard, you will be a loser.
 * User: Listen-Y.
 * Date: 2020-09-02
 * Time: 21:19
 */

public class Reptiles {

    // Root folder where the downloaded images are saved
    private final String AbsolutePath = "D:\\reptiles\\";

    // Crawl images for the given keyword and save them under a new folder
    public void pictureReptiles(String keyWords) {
        System.out.println("Default save path: " + AbsolutePath);
        String targetPath = AbsolutePath + keyWords + System.currentTimeMillis();
        File file = new File(targetPath);
        if (!file.exists()) {
            if (file.mkdirs()) {
                // The image save folder has been created
                // Fetch the resources
                resourceGet(file, keyWords);
            }
        } else {
            resourceGet(file, keyWords);
        }
    }

    private void resourceGet(File file, String keyWords) {
        InputStream inputStream = null;
        FileOutputStream fileOutputStream = null;
        try {
            // Build the URL of the search results page
            URL url = new URL("https://image.baidu.com/search/index" +
                    "?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=" + keyWords);
            // Open the resource
            inputStream = url.openStream();
            // Byte array used as a temporary read buffer
            byte[] buffer = new byte[1024];
            // Collect the raw page bytes first and decode them once at the end,
            // so multi-byte UTF-8 characters split across reads are not corrupted
            ByteArrayOutputStream pageBytes = new ByteArrayOutputStream();
            int len = -1;
            while ((len = inputStream.read(buffer)) != -1) {
                pageBytes.write(buffer, 0, len);
            }
            // The search page has been read completely; close its stream
            inputStream.close();

            // Apply the regular expression for image resources to pull out what we want
            String allPictureResource = pageBytes.toString(StandardCharsets.UTF_8.name());
            // Non-greedy match for links ending in "0.jpg" in the page source
            Pattern compile = Pattern.compile("https://.*?0\\.jpg");
            // Use a matcher to find the resources we want
            Matcher matcher = compile.matcher(allPictureResource);

            // Download every matched resource; if nothing matches, the loop simply never runs
            List<String> haveDown = new ArrayList<>();
            int count = 1;
            System.out.println("Starting to crawl...");
            while (matcher.find()) {
                // Take one matched result
                String pictureResource = matcher.group();
                // Skip this image if it has already been downloaded
                if (haveDown.contains(pictureResource)) {
                    continue;
                }
                System.out.println("Crawling image " + count++ + "...");
                // Connect to the image resource
                URL pictureURL = new URL(pictureResource);
                inputStream = pictureURL.openStream();
                // Create the file output stream
                fileOutputStream = new FileOutputStream(file.getPath() + "/" + System.currentTimeMillis() + ".jpg");
                // Copy the bytes into the file
                while ((len = inputStream.read(buffer)) != -1) {
                    fileOutputStream.write(buffer, 0, len);
                }
                // Close the per-image streams
                fileOutputStream.flush();
                fileOutputStream.close();
                inputStream.close();
                haveDown.add(pictureResource);
            }
            System.out.println("Crawling finished.");

        } catch (IOException e) {
            e.printStackTrace();
            System.out.println("Crawling failed...");
        } finally {
            // Close whatever streams may still be open
            if (inputStream != null) {
                try {
                    inputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            if (fileOutputStream != null) {
                try {
                    fileOutputStream.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
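
One practical caveat that the class above does not handle: URL.openStream() sends Java's default client identification, and some image hosts reject or throttle such requests. The helper below is a hedged sketch of a workaround, not part of the original code; the class name, the header value, and the timeouts are arbitrary examples. It opens the connection through HttpURLConnection and sets a User-Agent header before handing back the input stream.

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

public class ConnectionHelper {

    // Open an input stream to the given address with a browser-like User-Agent header.
    // The header value and the timeouts are examples only; adjust them as needed.
    public static InputStream openWithUserAgent(String address) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(address).openConnection();
        connection.setRequestProperty("User-Agent",
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64)");
        connection.setConnectTimeout(10_000);
        connection.setReadTimeout(10_000);
        return connection.getInputStream();
    }
}

If the plain openStream() calls in resourceGet ever start failing, they could be replaced with ConnectionHelper.openWithUserAgent(...) without touching the rest of the flow.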

Demo

import java.util.Scanner;

/**
 * Created with IntelliJ IDEA.
 * Description: If you don't work hard, you will be a loser.
 * User: Listen-Y.
 * Date: 2020-09-02
 * Time: 22:02
 */
public class Test {

    public static void main(String[] args) {
        Reptiles reptiles = new Reptiles();
        System.out.println("=== Positive-energy crawler tool started ===");
        System.out.println();
        System.out.print("Enter the keyword for the images you want to crawl: ");
        Scanner scanner = new Scanner(System.in);
        String keyWords = scanner.nextLine();
        reptiles.pictureReptiles(keyWords);
    }
}

Reposted from blog.csdn.net/Shangxingya/article/details/108412512