核心提示:网页正文提取。写这个的原因,最近在改一个网页正文提取的插件,但找遍了网站就是没有JS版的,于是乎就找了个Java版的代码。进行了修改。经测试 可用;代码自取;let content = getArti...
网页正文提取。写这篇文章的原因:最近在改一个网页正文提取的插件,但找遍了全网也没有找到JS版的实现,于是找了一份Java版的代码,并把它改写成了JS版。
经测试可用;
代码自取:
// Usage example: passes the inner HTML of the page's <body> element to
// getArticleContent and stores the extracted article text in `content`.
// `$` is presumably jQuery — TODO confirm. getArticleContent must already be
// defined when this statement runs.
let content = getArticleContent($('body').html());
/**
 * Extracts the main article text from an HTML string.
 *
 * The HTML is reduced to plain text, the text lines are grouped into blocks,
 * and blocks whose length changes sharply relative to the previous block are
 * treated as boundaries. The longest run of text between two consecutive
 * boundaries is returned.
 *
 * Declared as a function declaration so that it is hoisted and can be called
 * from code that appears earlier in the same scope.
 *
 * @param {string} body HTML markup to analyse (e.g. the inner HTML of <body>).
 * @returns {?string} The extracted text, or null when `body` is not a
 *     non-empty string or fewer than three boundaries were detected.
 */
function getArticleContent(body) {
  /** Lines per block minus one (block size = BLOCKS + 1). */
  const BLOCKS = 0;
  /** Minimum relative length change between neighbouring blocks that marks a boundary. */
  const CHANGE_RATE = 0.9;
  /** Lines whose trimmed length is not greater than this are ignored. */
  const MIN_LENGTH = 3;

  if (typeof body !== 'string' || body === '') {
    return null;
  }

  /**
   * Removes script/style elements and all remaining tags, then normalises
   * whitespace.
   *
   * @param {string} html Raw HTML.
   * @returns {string} Plain text with one candidate line per '\n'.
   */
  const stripTags = (html) => {
    const scriptElement = /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi;
    const styleElement = /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi;
    // A negated class instead of a lazy `(.|\s)*?` group: same match
    // (from '<' to the first '>'), without overlapping alternatives that can
    // backtrack heavily on an unclosed '<'.
    const anyTag = /<[^>]*>/g;

    return html
      .replace(scriptElement, '')
      .replace(styleElement, '')
      .replace(anyTag, '')
      .replace(/\r\n?/g, '\n')
      // Collapse runs of horizontal whitespace (including &nbsp; entities,
      // no-break and ideographic spaces) to one space. Real regular
      // expressions are required here: a string pattern passed to
      // String.prototype.replace is matched literally.
      .replace(/(?:&nbsp;|[ \t\u00a0\u3000])+/g, ' ')
      // Drop blank lines.
      .replace(/\n(?: ?\n)+/g, '\n')
      .trim();
  };

  /**
   * Groups the usable lines of `text` into blocks of BLOCKS + 1 lines.
   *
   * @param {string} text Plain text.
   * @returns {string[]} Dense array of block contents in document order.
   */
  const splitIntoBlocks = (text) => {
    const blocks = [];
    let pendingLines = [];

    for (const rawLine of text.split('\n')) {
      // Trim before measuring so whitespace-only lines do not count as content.
      const line = rawLine.trim();
      if (line.length <= MIN_LENGTH) {
        continue;
      }
      pendingLines.push(line);
      if (pendingLines.length > BLOCKS) {
        blocks.push(pendingLines.join(''));
        pendingLines = [];
      }
    }

    // Keep the trailing, not completely filled block. It is appended right
    // after the previous block so the array has no holes.
    if (pendingLines.length > 0) {
      blocks.push(pendingLines.join(''));
    }
    return blocks;
  };

  /**
   * Finds the boundary blocks and returns the longest text between two
   * consecutive boundaries.
   *
   * @param {string[]} blocks Blocks produced by splitIntoBlocks.
   * @returns {?string} Longest segment, or null if there are fewer than three
   *     boundaries.
   */
  const pickLongestSegment = (blocks) => {
    const boundaries = [];
    // Seed kept from the original implementation: the first block is compared
    // against the number of blocks.
    // NOTE(review): this mixes a block count with a character count; it was
    // possibly meant to be 0 or the first block's length — confirm.
    let currentLength = blocks.length;

    blocks.forEach((block, index) => {
      const previousLength = currentLength;
      currentLength = block.length;
      const changeRate =
        Math.abs(currentLength - previousLength) /
        Math.max(currentLength, previousLength);
      if (changeRate >= CHANGE_RATE) {
        boundaries.push(index);
      }
    });

    if (boundaries.length <= 2) {
      return null;
    }

    let longest = null;
    let longestLength = 0;
    for (let i = 1; i < boundaries.length; i++) {
      // A segment starts at a boundary block and ends just before the next one.
      const segment = blocks.slice(boundaries[i - 1], boundaries[i]).join('');
      if (segment.length > longestLength) {
        longestLength = segment.length;
        // Replace rather than append: only the longest segment is wanted.
        longest = segment;
      }
    }
    return longest;
  };

  return pickLongestSegment(splitIntoBlocks(stripTags(body)));
}