网页正文提取-前端--易笔记-weiyer.com

　　核心提示：网页正文提取。写这个的原因，最近在改一个网页正文提取的插件，但找遍了网站就是没有JS版的，于是乎就找了个Java版的代码。进行了修改。经测试可用；代码自取；let content = getArti...

网页正文提取。写这个的原因：最近在改一个网页正文提取的插件，但找遍了全网都没有找到 JS 版的实现，于是找了一份 Java 版的代码，并将其改写成了 JS 版。

经测试可用；

代码自取；

// Usage example: run this only after getArticleContent has been defined (it is
// declared with `let` below, so it is not hoisted). `$` is assumed to be jQuery.
let content = getArticleContent($('body').html());

	/**
         * Extracts the main article text from a page's HTML using a line-block
         * heuristic: markup is stripped, the remaining text is split into blocks of
         * lines, and the longest run of blocks lying between two sharp changes in
         * block length is taken to be the article body.
         *
         * @param {string} body HTML source of the page (e.g. the inner HTML of <body>).
         * @returns {?string} The extracted text (blocks concatenated without
         *     separators), or null when the input is empty / not a string or when
         *     no article segment could be identified.
         */
        let getArticleContent = function (body) {
            /**
             * Line-block size control: each block holds BLOCKS + 1 lines.
             */
            const BLOCKS = 0;
            /**
             * Relative change in length between consecutive blocks that marks a
             * boundary ("peak") of the article text.
             */
            const CHANGE_RATE = 0.9;
            /**
             * Lines whose trimmed length does not exceed this value are ignored.
             */
            const MIN_LENGTH = 3;

            // Nothing to analyse; also avoids calling string methods on null/undefined.
            if (typeof body !== "string" || body === "") {
                return null;
            }

            /**
             * Removes scripts, styles and all remaining tags from the HTML.
             * @param {string} html HTML text
             * @returns {string} Plain text with whitespace normalised
             */
            const deleteLabel = function (html) {
                return html
                    // Drop <script> and <style> elements together with their content.
                    .replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, "")
                    .replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, "")
                    // Drop every remaining tag. [\s\S] avoids the overlapping
                    // alternatives of (?:.|\s), which can backtrack badly on an
                    // unclosed "<".
                    .replace(/<[\s\S]*?>/g, "")
                    // Collapse runs of spaces, tabs, no-break and ideographic spaces.
                    // They are reduced to one space rather than deleted so that
                    // Latin-script words stay separated.
                    .replace(/[ \t\u00A0\u3000]+/g, " ")
                    // Collapse consecutive blank lines into a single line break.
                    // String.replace needs a RegExp here: a string pattern would be
                    // matched literally.
                    .replace(/(\r\n|\n)\s*\1/g, "$1")
                    .trim();
            };

            /**
             * Splits plain text into blocks of BLOCKS + 1 significant lines.
             * @param {string} text Plain text
             * @returns {string[]} Dense array of blocks, in document order
             */
            const splitBlock = function (text) {
                const blocks = [];
                let pending = "";
                let pendingLines = 0;

                for (const rawLine of text.split(/\r\n|\r|\n/)) {
                    // Measure the trimmed line so whitespace-only lines cannot
                    // turn into empty blocks.
                    const line = rawLine.trim();
                    if (line.length <= MIN_LENGTH) {
                        continue;
                    }
                    if (pendingLines > BLOCKS) {
                        // Current block is full: store it and start a new one.
                        blocks.push(pending);
                        pending = "";
                        pendingLines = 0;
                    }
                    pending += line;
                    pendingLines++;
                }

                // Keep the final, possibly incomplete block. It is appended directly
                // after the previous one so the array has no holes.
                if (pendingLines > 0) {
                    blocks.push(pending);
                }
                return blocks;
            };

            /**
             * Finds the blocks where the length changes sharply and returns the
             * longest text lying between two consecutive such blocks.
             * @param {string[]} map Blocks produced by splitBlock
             * @returns {?string} The selected text, or null if none qualifies
             */
            const judgeBlocks = function (map) {
                if (map.length === 0) {
                    return null;
                }

                const peaks = [];
                // Seed with the first block's own length so the comparison is always
                // between two character counts (not a character count and the
                // number of blocks).
                let previousLength = map[0].length;
                map.forEach((block, index) => {
                    const currentLength = block.length;
                    // Every block is longer than MIN_LENGTH, so the divisor is never 0.
                    const changeRate = Math.abs(currentLength - previousLength)
                        / Math.max(currentLength, previousLength);
                    if (changeRate >= CHANGE_RATE) {
                        peaks.push(index);
                    }
                    previousLength = currentLength;
                });

                // NOTE(review): more than two peaks are required, as in the original
                // code, although two peaks already delimit one segment - confirm
                // whether ">= 2" was intended.
                if (peaks.length <= 2) {
                    return null;
                }

                // Among the segments delimited by consecutive peaks, keep the longest.
                let best = null;
                let bestLength = 0;
                for (let i = 1; i < peaks.length; i++) {
                    const candidate = map.slice(peaks[i - 1], peaks[i]).join("");
                    if (candidate.length > bestLength) {
                        bestLength = candidate.length;
                        best = candidate;
                    }
                }
                return best;
            };

            return judgeBlocks(splitBlock(deleteLabel(body)));
        };

网页正文提取

时间：2017/3/16 9:24:00 点击：

本类热门

本类推荐

本类固顶

网页正文提取

时间：2017/3/16 9:24:00 点击：

您可能感兴趣的文章

相关评论

发表我的评论

本类热门

本类推荐

本类固顶