java html标签替换

工作中有时候需要爬取其他平台的文章,或者自己由于兴趣爬取文章,文章后续的处理或展示,一般都是自定义的标签格式,可能与标准的html格式不一致,这里就需要标签替换。

有不少文章都是使用正则表达式来替换,正则表达式替换可能由于匹配问题导致多换或少换。最靠谱的方式当然还是根据index查找某个标签的beginIndex和endIndex,然后进行内容的替换,这样肯定是最准确的,不过效率稍微低一些!

代码中主要展示了如何将 `<img src='%s' mimetype="%s" width='%s' height='%s' alt='%s' />` 替换为
`<img data-src='%s' data-mimetype="%s" data-width='%s' data-height='%s' alt='%s' />` 样式,并在 img 前添加 div 及占位符。

/**
 * Holds the dimensions of a media item (video or image).
 *
 * <p>Accessors such as {@code getHeight()} / {@code setHeight(int)} are not written out here;
 * they are presumably generated by the {@code @Data} annotation (Lombok) - confirm against the
 * project's build setup.
 */
@Data
public class ArticleMedia {
    /**
     * Height of the video/image.
     */
    private int height;
    /**
     * Width of the video/image.
     */
    private int width;
}

    /**
     * Downloads the image at {@code path} and reads its dimensions.
     *
     * <p>This is best effort: every failure is logged and swallowed, in which case the returned
     * object keeps the default field values (0 for both height and width).
     *
     * @param path absolute URL of the image to inspect
     * @return a new {@code ArticleMedia}; height and width are set only if the image was decoded
     */
    private static ArticleMedia getImageInfo(String path) {
        ArticleMedia articleMedia = new ArticleMedia();
        try {
            // The connection is only read from, so it is not flagged for output.
            URLConnection connection = new URL(path).openConnection();
            // ImageIO.read(InputStream) leaves the stream open; try-with-resources closes it.
            try (java.io.InputStream in = connection.getInputStream()) {
                BufferedImage image = ImageIO.read(in);
                if (image == null) {
                    // ImageIO returns null when no registered reader understands the data.
                    log.warn("path {} file may not image", path);
                } else {
                    articleMedia.setHeight(image.getHeight());
                    articleMedia.setWidth(image.getWidth());
                }
            }
        } catch (IOException ex) {
            // Trailing throwable argument keeps the root cause in the log.
            log.warn("path {} file not exist", path, ex);
        } catch (RuntimeException ex) {
            log.warn("path {} file may not image", path, ex);
        } finally {
            log.info("path {} article info {}", path, articleMedia);
        }
        return articleMedia;
    }
    
    /**
     * Rewrites every {@code <img ...>} tag in {@code content} into the lazy-loading markup built
     * by {@code imgReplace}, and renames {@code <div class="pgc-img">} wrappers to
     * {@code <div class="my-custom-image">}.
     *
     * <p>Tags are located by index rather than by regular expression. A tag is left untouched when
     * it has no {@code src}, when its mime type cannot be determined, or when no positive width
     * can be obtained from either the markup or the image itself.
     *
     * <p>Limitation: the end of a tag is the first {@code >} after {@code <img}, so a raw
     * {@code >} inside an attribute value is not supported.
     *
     * @param content HTML fragment to convert; may be {@code null}
     * @return the converted HTML, or {@code null} when {@code content} is {@code null} or empty
     */
    private static String formatContent(String content) {
        if (content == null || content.length() == 0) {
            return null;
        }
        boolean hasImageDiv = content.contains("<div class=\"pgc-img\">");
        if (hasImageDiv) {
            // Plain literal replacement; no regular expression is involved.
            content = content.replace("<div class=\"pgc-img\">", "<div class=\"my-custom-image\">");
        }
        StringBuilder stringBuilder = new StringBuilder(content);
        int searchFrom = 0;
        while (true) {
            int imgBeginIndex = stringBuilder.indexOf("<img", searchFrom);
            if (imgBeginIndex < 0) {
                break;
            }
            int imgEndIndex = stringBuilder.indexOf(">", imgBeginIndex);
            if (imgEndIndex < 0) {
                break;
            }
            // Default for every "skip" path: resume right after this tag so that an
            // immediately following <img> is still seen.
            searchFrom = imgEndIndex + 1;

            // All attribute lookups work on this tag only, never on later markup.
            String tag = stringBuilder.substring(imgBeginIndex, imgEndIndex + 1);
            char afterName = tag.charAt(4);
            if (!Character.isWhitespace(afterName) && afterName != '/' && afterName != '>') {
                // Some other element whose name merely starts with "img".
                continue;
            }

            String picUrl = extractImgAttribute(tag, "src");
            if (picUrl == null || picUrl.isEmpty()) {
                log.warn("img tag without src:{}", tag);
                continue;
            }
            String mimeType = ImageUtils.getMimeType(picUrl);
            if (StringUtils.isBlank(mimeType)) {
                log.error("unknown mimetype of url:{}", picUrl);
                continue;
            }

            int width = parseImgDimension(extractImgAttribute(tag, "width"));
            int height = parseImgDimension(extractImgAttribute(tag, "height"));
            String alt = extractImgAttribute(tag, "alt");
            if (alt == null) {
                alt = "";
            }

            // Width or height missing from the markup: look at the image itself.
            if (width <= 0 || height <= 0) {
                ArticleMedia media = getImageInfo(picUrl);
                if (media.getWidth() > 0 && media.getHeight() > 0) {
                    width = media.getWidth();
                    height = media.getHeight();
                }
            }
            if (width <= 0) {
                // The placeholder ratio is height / width; without a width keep the original tag.
                log.warn("unknown size of url:{}", picUrl);
                continue;
            }

            String replacement = imgReplace(picUrl, mimeType, width, height, alt, hasImageDiv);
            stringBuilder.replace(imgBeginIndex, imgEndIndex + 1, replacement);
            // Continue after the inserted markup so the generated <img> is not processed again.
            searchFrom = imgBeginIndex + replacement.length();
        }
        return stringBuilder.toString();
    }

    /**
     * Returns the value of attribute {@code name} inside a single {@code <img ...>} tag.
     *
     * <p>Attributes are scanned left to right, so text inside another attribute's value (for
     * example "width" inside a URL) and prefixed names such as {@code data-src} never match.
     * Double-quoted, single-quoted and unquoted values are accepted; names are compared
     * case-insensitively.
     *
     * @return the raw value, {@code ""} for a value-less attribute, or {@code null} if absent
     */
    private static String extractImgAttribute(String tag, String name) {
        int len = tag.length();
        int pos = 4; // skip the leading "<img"
        while (pos < len) {
            while (pos < len && (Character.isWhitespace(tag.charAt(pos)) || tag.charAt(pos) == '/')) {
                pos++;
            }
            if (pos >= len || tag.charAt(pos) == '>') {
                break;
            }
            int nameStart = pos;
            while (pos < len) {
                char c = tag.charAt(pos);
                if (Character.isWhitespace(c) || c == '=' || c == '>' || c == '/') {
                    break;
                }
                pos++;
            }
            String attrName = tag.substring(nameStart, pos);
            while (pos < len && Character.isWhitespace(tag.charAt(pos))) {
                pos++;
            }
            String value = "";
            if (pos < len && tag.charAt(pos) == '=') {
                pos++;
                while (pos < len && Character.isWhitespace(tag.charAt(pos))) {
                    pos++;
                }
                if (pos < len && (tag.charAt(pos) == '"' || tag.charAt(pos) == '\'')) {
                    char quote = tag.charAt(pos);
                    int close = tag.indexOf(quote, pos + 1);
                    if (close < 0) {
                        // Unterminated quote: the rest of the tag cannot be parsed reliably.
                        return null;
                    }
                    value = tag.substring(pos + 1, close);
                    pos = close + 1;
                } else {
                    int valueStart = pos;
                    while (pos < len && !Character.isWhitespace(tag.charAt(pos)) && tag.charAt(pos) != '>') {
                        pos++;
                    }
                    value = tag.substring(valueStart, pos);
                }
            }
            if (attrName.equalsIgnoreCase(name)) {
                return value;
            }
        }
        return null;
    }

    /**
     * Parses a {@code width}/{@code height} attribute value.
     *
     * @return the positive integer value, or 0 when the value is absent or not a plain positive
     *     integer (for example {@code "100%"})
     */
    private static int parseImgDimension(String value) {
        if (value == null) {
            return 0;
        }
        try {
            int parsed = Integer.parseInt(value.trim());
            return parsed > 0 ? parsed : 0;
        } catch (NumberFormatException ex) {
            return 0;
        }
    }

    /**
     * Builds the lazy-loading markup for one image: a placeholder {@code div} whose
     * {@code padding-bottom} is the image's height / width as a percentage, followed by an
     * {@code <img>} carrying the real attributes as {@code data-*} values.
     *
     * @param imgUrl image URL, written to {@code data-src}
     * @param mimeType image mime type, written to {@code data-mimetype}
     * @param width image width; a value of 0 or less yields a placeholder ratio of 0
     * @param height image height
     * @param alt alternative text
     * @param hasImageDiv {@code true} when the caller's markup already wraps the image in a
     *     {@code my-custom-image} div, in which case no wrapper is emitted here
     * @return the generated HTML fragment
     */
    private static String imgReplace(String imgUrl, String mimeType, int width, int height, String alt, boolean hasImageDiv) {
        StringBuilder sb = new StringBuilder();
        if (!hasImageDiv) {
            sb.append("\n            <div class=\"my-custom-image\">\n");
        }
        // Floating-point division keeps the fractional part that %.3f is meant to print;
        // the guard avoids dividing by zero when the width is unknown.
        double paddingBottomPercent = width > 0 ? height * 100.0 / width : 0.0;
        String templateStr = "               <!-- 图片占位元素,padding-bottom 的值由图片本身真实的高度除以宽度转化为百分数 -->\n" +
                "                <div class=\"my-custom-imageplaceholder\" style=\"width: %.0f%%; padding-bottom: %.3f%%;\"></div>\n" +
                "                <img data-src='%s' data-mimetype=\"%s\" data-width='%s' data-height='%s' alt='%s' />\n";
        // Locale.ROOT: CSS needs '.' as the decimal separator whatever the default locale is.
        sb.append(String.format(java.util.Locale.ROOT, templateStr, 100.0, paddingBottomPercent,
                escapeImgAttribute(imgUrl), escapeImgAttribute(mimeType), width, height, escapeImgAttribute(alt)));
        if (!hasImageDiv) {
            sb.append("            </div>\n");
        }
        return sb.toString();
    }

    /**
     * Escapes quote characters so a value cannot terminate the attribute it is written into.
     * {@code &} is left alone because values copied from existing markup may already contain
     * character references. A {@code null} value becomes the empty string.
     */
    private static String escapeImgAttribute(String value) {
        if (value == null) {
            return "";
        }
        return value.replace("'", "&#39;").replace("\"", "&quot;");
    }

    /**
     * Demo entry point: feeds a sample article fragment containing two {@code <img>} tags
     * through {@code formatContent} and prints the converted HTML to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Compile-time constant concatenation; the resulting string equals the original literal.
        final String sampleArticle =
                "<img src=\"http://p3.pstatp.com/large/pgc-image/f67024509a3e4d639bdd885d4dd9b41a\" width=\"640\" height=\"458\" alt=\"清朝:买官和考科举当官,哪一个难度更大?\" inline=\"0\">"
                + "<p class=\"pgc-img-caption\"></p>"
                + "<p class=\"ql-align-justify\">\u200B</p>"
                + "<p class=\"ql-align-justify\">先来说说科举,科举出现于隋唐,在明、清时形成定制。当时科举的选题不仅有固定的判断标准,而且监考过程极为严格。</p>"
                + "<p class=\"ql-align-justify\">清朝因为科举作弊的案件,杀过许多考官。对于一个读书人来说,当他们走上科举这条路后,就必须先参加地方举办的童生试。</p>"
                + "<p class=\"ql-align-justify\"><br></p>"
                + "<img src=\"http://p1.pstatp.com/large/pgc-image/822159119cf940d3b210002b777c31b8\" width=\"640\" height=\"427\" alt=\"清朝:买官和考科举当官,哪一个难度更大?\" inline=\"0\">";

        String converted = formatContent(sampleArticle);
        System.out.println(converted);
    }
发布了35 篇原创文章 · 获赞 81 · 访问量 3万+

猜你喜欢

转载自blog.csdn.net/chanllenge/article/details/90404917
今日推荐