Java利用phantomjs页面查找高亮截图

背景

前段时间,项目需要实现一个功能:根据已提供的数据(网站URL和对应的错别字),对网页中的错别字进行高亮显示,并自动截图取证。

经历

起初,因为一些原因,临时用了cdp4j+Robot结合,通过Ctrl + F,然后Ctrl + C/V来模拟手动页面查找进行截图。
弊端:
1.服务器上面需要安装Chrome;
2.截图时,需要当前Chrome窗口保持在顶层,否则会将需要查找的错别字,粘贴到其它地方;
3.由于第二条的限制,所以没法启动多个线程同时截图;

部分代码:

/**
 * Opens a page in a launched browser session, drives the browser's "find in page"
 * dialog through simulated keystrokes (Ctrl+F, paste, Enter), captures a screenshot
 * and writes it as a PNG file under {@code filePath}.
 *
 * The keystrokes are sent with java.awt.Robot and the search term travels through the
 * system clipboard, so the statement order below matters. Per the surrounding article,
 * the browser window must stay in the foreground while this runs.
 *
 * {@code filePath} is not declared in this method; it comes from the enclosing class.
 * All exceptions are caught and printed; nothing is propagated to the caller.
 */
@Override
public void run(){
    Launcher launcher = null;
    Robot robot = null;
    OutputStream out = null;
    Session session = null;
    try{
        launcher = new Launcher();
        SessionFactory factory = launcher.launch();
        session = factory.create();
        session.navigate("需要截图的网站URL");
        session.waitDocumentReady(30000);// wait at most 30 seconds for the page to render

        // Press Ctrl+F (intended to open the browser's find bar).
        robot = new Robot();
        robot.keyPress(KeyEvent.VK_CONTROL);
        robot.keyPress(KeyEvent.VK_F);
        robot.keyRelease(KeyEvent.VK_F);
        robot.keyRelease(KeyEvent.VK_CONTROL);
        // Put the search term on the system clipboard, then paste it with Ctrl+V.
        Clipboard clipboard = Toolkit.getDefaultToolkit().getSystemClipboard();
        Transferable transferable = new StringSelection("需要查找的错别字");
        clipboard.setContents(transferable, null);
        robot.keyPress(KeyEvent.VK_CONTROL);
        robot.keyPress(KeyEvent.VK_V);
        robot.keyRelease(KeyEvent.VK_V);
        robot.keyRelease(KeyEvent.VK_CONTROL);
        // Press Enter (intended to trigger the search).
        robot.keyPress(KeyEvent.VK_ENTER);
        robot.keyRelease(KeyEvent.VK_ENTER);

            // Capture the screenshot bytes and, if any were returned, save them to a new file.
            byte[] png = session.captureScreenshot();
            if(png!=null && png.length>0){
                File file = new File(filePath);
                // NOTE(review): mkdirs() returns false when the directory already exists;
                // the branch below only re-creates the same File object and has no effect.
                if(!file.mkdirs()){
                    file = new File(filePath);
                }
                // Random UUID file name, so separate runs do not overwrite each other.
                String path = filePath + "/"+UUID.randomUUID() + ".png";
                file = new File(path);
                out = new FileOutputStream(file);
                out.write(png);
                out.flush();
                // your own business logic goes here (elided in the article)
                .....
            }
    } catch (Exception e) {
        e.printStackTrace();
    }finally {
    	if(session!=null){
            session.close();
        }
   		// Close the current window (disabled: Ctrl+W via Robot)
        //if(robot!=null){
        //    robot.keyPress(KeyEvent.VK_CONTROL);
        //    robot.keyPress(KeyEvent.VK_W);
        //    robot.keyRelease(KeyEvent.VK_W);
        //    robot.keyRelease(KeyEvent.VK_CONTROL);
        //}
        // Kill the launched browser process
        if(launcher!=null){
            launcher.getProcessManager().kill();
        }
        // Close the output stream
        if(out!=null){
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

目前方法

利用phantomjs进行截图。需要两个东西:1.phantomjs.exe ; 2.js脚本(此处的screenshot.js)
好处:
1.phantomjs是无头浏览器,截图过程中,不会弹出浏览器页面,操作服务器不会造成影响;
(安装方式:从PhantomJS官网下载压缩包,并将其中的exe文件解压到指定目录即可。)
2.可以多个线程同时截图;
3.可以截长图;

目录结构
java代码:

    /**
     * Runs PhantomJS with screenshot.js to render {@code url} with {@code word}
     * highlighted, then checks whether the screenshot file was produced.
     *
     * {@code url}, {@code word} and {@code AppConfig.WORDSSCREENSHOT_DIR} are not declared
     * in this method; they come from the enclosing class / project.
     *
     * The child process receives its arguments as separate strings (no whitespace
     * splitting), its stderr is merged into stdout so that draining one stream is enough
     * to keep it from blocking, and the method waits for it to exit before looking for
     * the output file. All exceptions are caught and printed; nothing is propagated.
     */
    @Override
    public void run() {
        String projectPath = Thread.currentThread().getContextClassLoader().getResource("").getPath();
        // Drops the first character and the last 16 characters of the classpath root.
        // NOTE(review): presumably a leading "/" (Windows path) and a trailing
        // "WEB-INF/classes/" — confirm against the actual deployment layout.
        String needPath = projectPath.substring(1, projectPath.length() - 16);
        String path = needPath + "phantomjs/";
        String file_name = UUID.randomUUID() + ".jpg";
        String new_file = AppConfig.WORDSSCREENSHOT_DIR + "/" + file_name; // where the screenshot is saved

        String phantomjsExePath = path + "phantomjs.exe"; // phantomjs executable inside the project
        String codejsPath = path + "screenshot.js";       // script executed by phantomjs
        Process process = null;
        try {
            // One argument per element: a URL, path or word containing spaces is passed
            // through intact instead of being split by Runtime.exec(String).
            ProcessBuilder builder = new ProcessBuilder(
                    phantomjsExePath, codejsPath, url.trim(), new_file, word);
            builder.redirectErrorStream(true);
            process = builder.start();

            // Drain the child's output until it closes the stream; the reader is closed
            // even if reading fails.
            StringBuilder output = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(process.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    output.append(line).append('\n');
                }
            }
            int exitCode = process.waitFor();

            System.out.println("url:" + url + " -->截图结束");
            File screenshot = new File(new_file);
            if (screenshot.exists()) { // screenshot succeeded
                // your own business logic goes here (elided in the article)
                ...
            } else {
                System.err.println("url:" + url + " screenshot not created, phantomjs exit code "
                        + exitCode + ", output: " + output);
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so the owning thread/pool can see it.
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (process != null) {
                process.destroy();
            }
        }
    }

js代码:

/**
 * Created by RYK on 2018/5/24.
 *
 * PhantomJS script: opens a page, highlights every occurrence of a word with a
 * yellow background and renders the page to an image file.
 *
 * Usage: phantomjs screenshot.js <url> <outputPath> <word>
 *
 * Written in ES5 (var, function expressions) because it runs inside PhantomJS,
 * not Node.js.
 */
var page = require('webpage').create();
var system = require('system');

page.viewportSize = {
    width : 1024,
    height : 800
};
// Set individual keys so the other entries of page.settings are left in place.
page.settings.javascriptEnabled = true;
page.settings.loadImages = true;
page.settings.userAgent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.31 (KHTML, like Gecko) PhantomJS/20.0';
page.settings.resourceTimeout = 60 * 1000;

var address, filePath, value;
if (system.args.length < 4) {
    console.log('Usage: phantomjs screenshot.js <url> <outputPath> <word>');
    phantom.exit(1);
} else {
    address = system.args[1];  // page URL
    filePath = system.args[2]; // where the image is saved
    value = system.args[3];    // word to highlight
    page.open(address, function (status) {
        if (status !== 'success') {
            console.log('FAIL to load the address');
            phantom.exit(1);
            // Return so nothing below is scheduled for a page that failed to load.
            return;
        }
        // This function is executed inside the page, so it cannot see variables of
        // this script; the word is passed in as the argument `needle`.
        page.evaluate(function (needle) {
            window.scrollTo(0, 10000); // scroll towards the bottom of the page
            window.setTimeout(function () {
                if (!needle || !document.body) {
                    return;
                }
                // Walk text nodes instead of rewriting body.innerHTML: the word is
                // matched literally (no RegExp / "$" replacement semantics) and
                // occurrences inside tags, attributes or scripts are left untouched.
                var skippedParents = { SCRIPT: true, STYLE: true, NOSCRIPT: true, TEXTAREA: true };
                var walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT, null, false);
                var matches = [];
                var node;
                while ((node = walker.nextNode())) {
                    var parent = node.parentNode;
                    if (!parent || skippedParents[parent.nodeName]) {
                        continue;
                    }
                    if (node.nodeValue.indexOf(needle) !== -1) {
                        matches.push(node);
                    }
                }
                // Replace after the walk is finished so the walker never sees the
                // nodes inserted here.
                for (var i = 0; i < matches.length; i++) {
                    var textNode = matches[i];
                    var pieces = textNode.nodeValue.split(needle);
                    var fragment = document.createDocumentFragment();
                    for (var j = 0; j < pieces.length; j++) {
                        if (j > 0) {
                            var mark = document.createElement('span');
                            mark.style.background = 'yellow';
                            mark.appendChild(document.createTextNode(needle));
                            fragment.appendChild(mark);
                        }
                        if (pieces[j]) {
                            fragment.appendChild(document.createTextNode(pieces[j]));
                        }
                    }
                    textNode.parentNode.replaceChild(fragment, textNode);
                }
            }, 1000);
        }, value);
        // Render after a fixed delay so the in-page highlighting (scheduled 1s after
        // evaluate) has run before the image is written.
        window.setTimeout(function () {
            page.render(filePath); // save the screenshot
            phantom.exit();
        }, 5000);
    });
}

不足

测试时,单个网站截图用时较长。一个可以确定的原因是:screenshot.js 在页面加载完成后固定等待5秒(setTimeout)才执行截图,再加上页面本身的加载时间;其它原因不详。正式环境可以用多线程弥补。

说明

这是我的第一篇文章,多多指正。
感谢前端同事指导的高亮显示。

猜你喜欢

转载自blog.csdn.net/AD_Marcelo/article/details/88867384