js抓取网络小说内容下载

  • js抓取网络小说内容方法1
/**
 * Downloads every chapter linked from the current catalogue page as one .txt file.
 *
 * Usage: open the novel's catalogue (table of contents) page, press F12 to open
 * the console, paste this function and call it.
 *
 * Chapters are fetched one at a time, in catalogue order. A chapter that cannot
 * be fetched is logged and left out instead of aborting the whole download.
 * Links, scripts and style sheets are dropped from each page, then text shared
 * with a neighbouring chapter page (site header / footer) is trimmed away.
 *
 * @param {string|string[]} [removeString=""] Text to delete from every chapter;
 *        either a single string or an array of strings.
 * @returns {Promise<void>} Settles once the browser download has been triggered.
 */
function simpleDownloadStoryInCatalogueWeb(removeString=""){
    // Catalogue page URL (query string and hash are deliberately left out).
    const catalogueWebUrl = document.location.origin+document.location.pathname;
    // Novel name, taken from the page <title>.
    const storyTitle = document.querySelector('title')?.innerText||'';

    // Every <a> whose href starts with the catalogue URL is treated as a chapter link.
    // When one href occurs several times only its LAST occurrence is kept, so the
    // anchors are walked backwards and document order is restored afterwards.
    const seenHrefs = new Set();
    const catalogues = [];
    const anchors = Array.from(document.getElementsByTagName('a'));
    for(let i=anchors.length-1;i>=0;--i){
        const a = anchors[i];
        if(!a||!a.href||!a.href.startsWith(catalogueWebUrl)||seenHrefs.has(a.href))continue;
        seenHrefs.add(a.href);
        catalogues.push(a);
    }
    catalogues.reverse();
    const catalogueLen = catalogues.length;
    const titles = catalogues.map(a=>a.innerText.trim());

    // Prefer the encoding the browser actually used for this page: it also covers
    // charsets declared through http-equiv or HTTP headers, which meta[charset] misses.
    const charsetLabel = document.characterSet
        ||document.querySelector('meta[charset]')?.getAttribute('charset')
        ||'utf-8';
    let textDecoder;
    try{
        textDecoder = new TextDecoder(charsetLabel);
    }catch(e){
        console.warn('无法识别的字符集,改用utf-8:'+charsetLabel,e);
        textDecoder = new TextDecoder('utf-8');
    }

    // Number of leading characters two strings have in common.
    const commonPrefixLength = (s1,s2)=>{
        const max = Math.min(s1.length,s2.length);
        let n = 0;
        while(n<max&&s1[n]===s2[n])++n;
        return n;
    };
    // Number of trailing characters two strings have in common.
    const commonSuffixLength = (s1,s2)=>{
        const max = Math.min(s1.length,s2.length);
        let n = 0;
        while(n<max&&s1[s1.length-1-n]===s2[s2.length-1-n])++n;
        return n;
    };
    // Removes the parts of a chapter page that it shares with a neighbouring page.
    const stripSharedParts = (text,neighbour,title)=>{
        if(!text||!neighbour)return text;
        // Tail: drop the common suffix, unless that would swallow a whole chapter.
        const tail = commonSuffixLength(text,neighbour);
        const body = tail<Math.min(text.length,neighbour.length)
            ?text.substring(0,text.length-tail)
            :text;
        // Head: cut at the chapter title when it is followed by more than 100
        // characters (a title mentioned near the very end is not the heading).
        const titleIdx = body.indexOf(title);
        if(titleIdx===0)return body;
        if(titleIdx>=1&&titleIdx+title.length<body.length-100)
            return body.substring(titleIdx);
        // Otherwise drop the common prefix, but never cut into the title itself.
        let head = commonPrefixLength(body,neighbour);
        if(titleIdx>=1)head = Math.min(head,titleIdx);
        return head<body.length?body.substring(head):body;
    };
    // Turns the source of a chapter page into plain text.
    const extractPageText = (pageSource)=>{
        const hStart='<html>',hEnd='</html>';
        const idxS = pageSource.indexOf(hStart);
        const idxE = pageSource.lastIndexOf(hEnd);
        const inner = idxS>=0&&idxE>idxS
            ?pageSource.substring(idxS+hStart.length,idxE)
            :pageSource;
        const html = document.createElement('html');
        html.innerHTML = inner;
        // querySelectorAll returns a static list, so removing while iterating is safe.
        for(const el of html.querySelectorAll('a,script,style'))el.remove();
        return html.innerText||'';
    };

    const run = async ()=>{
        // '' marks a chapter that could not be read; it is skipped in the output.
        const chapterTexts = new Array(catalogueLen).fill('');
        for(let idx=0;idx<catalogueLen;++idx){
            // If the site starts blocking requests, await a short delay (e.g. 300ms) here.
            try{
                const resp = await fetch(catalogues[idx].href);
                if(!resp.ok)throw new Error('HTTP '+resp.status);
                chapterTexts[idx] = extractPageText(textDecoder.decode(await resp.arrayBuffer()));
                console.log("完成读取章节:"+titles[idx]);
            }catch(e){
                console.warn("读取章节失败,已跳过:"+titles[idx],e);
            }
        }
        console.log("完成小说读取,正在准备处理内容");
        let removeList = [];
        if(Array.isArray(removeString)){
            removeList = removeString.filter(s=>s);
        }else if(typeof removeString === 'string' && removeString.length>=1){
            removeList = [removeString];
        }
        const cleaned = chapterTexts.map(text=>removeList.reduce((acc,rm)=>acc.replaceAll(rm,""),text));
        // Each chapter is compared with the next one (the last chapter with the
        // previous one), so every chapter gets trimmed regardless of the chapter count.
        const trimmed = cleaned.map((text,i)=>stripSharedParts(text,cleaned[i+1]||cleaned[i-1],titles[i]));
        console.log('完成部分无关内容移除,正在添加章节换行');
        const parts = trimmed.map((text,i)=>{
            if(!text)return '';
            const body = text.replaceAll("\n\n","\r\n\r\n\n\n")+"\r\n\r\n";
            const ti = titles[i];
            // Put the title on its own line; plain concatenation keeps "$" in titles literal.
            const rest = body.startsWith(ti)?body.substring(ti.length):body;
            return ti+"\r\n"+rest;
        });
        console.log('小说内容处理完成,正在准备下载');
        const a = document.createElement('a');
        const name = storyTitle||'小说';
        const blob = new Blob([name,"\r\n\r\n\r\n",...parts], {type: 'text/plain'});
        const url = window.URL.createObjectURL(blob);
        a.href = url;
        a.download = name+".txt";
        a.click();
        window.URL.revokeObjectURL(url);
    };
    return run();
}
// Run immediately with the default argument (no extra text is stripped from the chapters).
simpleDownloadStoryInCatalogueWeb();
  • js抓取网络小说内容方法2
/**
 * Downloads a novel by reading its chapter catalogue page and then every chapter page.
 * Create this object on the chapter catalogue page or on any page that is not a chapter page.
 */
class DownloadStory{
    /**
     * Builds a downloader; every argument except the catalogue URL is optional.
     * A single string may be passed wherever an array of strings is expected.
     * @param {string} cataloguesHtmlUrl URL of the page holding the chapter catalogue; defaults to the current page
     * @param {string} cataloguesCssSelector CSS selector of the DOM node(s) containing the catalogue
     * @param {Array<string>} cataloguesStartWhihString a chapter title starts with any one of these, default ['第']
     * @param {Array<string>} cataloguesMustContainString a chapter title must contain any one of these, default ['章']
     * @param {Array<string>} cataloguesMableContainStringEventNotHaveFirstTwo a title containing any one of these is accepted even when the two rules above do not match
     * @param {string} contentCssSelector CSS selector of the DOM node(s) containing the chapter text
     * @param {Array<string>} contentStartWithString the chapter text starts at any one of these; when empty it starts at the chapter title
     * @param {Array<string>} contentEndWithString the chapter text ends at any one of these
     * @param {Array<string>} contentRemoveString strings deleted from the chapter text (all of them)
     * @param {string} charset character set of the HTML pages, default GBK
     * @param {string} title novel title; when omitted it is read from the catalogue page <title>
     * @throws {Error} when no catalogue URL is given and there is no current page location
     */
    constructor(cataloguesHtmlUrl,
                cataloguesCssSelector,
                cataloguesStartWhihString,
                cataloguesMustContainString,
                cataloguesMableContainStringEventNotHaveFirstTwo,
                contentCssSelector,
                contentStartWithString,
                contentEndWithString,
                contentRemoveString,
                charset='GBK',title){
        if(title){
            this.title = title;
        }
        let catalogueUrl = cataloguesHtmlUrl;
        if(!catalogueUrl){
            if(globalThis.location&&globalThis.location.href){
                catalogueUrl = globalThis.location.href;
            }
            else throw new Error('请传入一个章节目录');
        }
        // Accept an array, a single value, or nothing (then use the fallback).
        const toList = (value,fallback)=>Array.isArray(value)?value:(value?[value]:fallback);
        // A null charset would otherwise crash on toUpperCase().
        const charsetLabel = charset||'GBK';
        this.cataloguesHtmlUrl = catalogueUrl;
        this.cataloguesHtmlUrlOrigin = new URL(catalogueUrl).origin;
        this.cataloguesCssSelector = cataloguesCssSelector;
        this.cataloguesStartWhihString = toList(cataloguesStartWhihString,['第']);
        this.cataloguesMustContainString = toList(cataloguesMustContainString,['章']);
        this.cataloguesMableContainStringEventNotHaveFirstTwo = toList(cataloguesMableContainStringEventNotHaveFirstTwo,[]);
        this.contentCssSelector = contentCssSelector;
        this.contentStartWithString = toList(contentStartWithString,[]);
        this.contentEndWithString = toList(contentEndWithString,[]);
        this.contentRemoveString = toList(contentRemoveString,[]);
        this.charset = charsetLabel.toUpperCase();
        this.lastUsedCharset = this.charset;
        this.textDecoder = new TextDecoder(charsetLabel);
        this.lastUsedTextDecoder = this.textDecoder;
    }

    /**
     * Downloads the novel: reads the catalogue, fetches every chapter and saves
     * the result as "<title>.txt" through a temporary download link.
     * @throws {Error} when no chapter URL or no chapter text was found
     */
    async down(){
        await this.findStoryCataloguesInHtml();
        const urls = this.allContentUrls;
        const titles = this.allContentTitles;
        if(urls==null||urls.length<=0) throw new Error('没找到小说内容url');
        const str = await this.getContentText(urls,titles);
        if(!str) throw new Error('没找到小说内容');
        const a = document.createElement('a');
        const name = this.title||'小说';
        const blob = new Blob([name+"\r\n\r\n\r\n"+str], {type: 'text/plain'});
        const url = window.URL.createObjectURL(blob);
        a.href = url;
        a.download = name+".txt";
        a.click();
        window.URL.revokeObjectURL(url);
    }
    /**
     * Finds the links of all chapters on the catalogue page and stores them in
     * allContentUrls / allContentTitles (same length, same order).
     */
    async findStoryCataloguesInHtml(){
        const html = await this.fetchHtmlContent(this.cataloguesHtmlUrl);
        if(!this.title){
            const titleEl = html.querySelector('title');
            this.title = titleEl?titleEl.innerText:'';
            this.allContextText = this.title+"\r\n\r\n";
        }
        const roots = this.cataloguesCssSelector
            ?[...html.querySelectorAll(this.cataloguesCssSelector)]
            :[];
        // Fall back to the whole page when the selector is missing or matches nothing.
        if(roots.length<=0)roots.push(html);
        // url -> title. One Map keeps each title paired with its URL even when
        // two chapters share the same title; the first occurrence of a URL wins.
        const chapters = new Map();
        const collect = (a)=>{
            if(!this.isCatalogue(a))return;
            const url = this.getContentUrl(a);
            if(!chapters.has(url))chapters.set(url,a.innerText);
        };
        for(const h of roots){
            if(h.tagName.toUpperCase()==='A'){
                collect(h);
            }else{
                for(const a of h.querySelectorAll('a'))collect(a);
            }
        }
        this.allContentUrls = [...chapters.keys()];
        this.allContentTitles = [...chapters.values()];
    }
    /**
     * Tells whether an <a> element is a chapter entry of the catalogue.
     * @param {{href:string,innerText:string}} a anchor to test
     * @returns {boolean}
     */
    isCatalogue(a){
        if(!a.href)return false;
        const txt = a.innerText;
        if(!txt)return false;
        // Regular chapter: starts with one of the prefixes AND contains one of the markers.
        const hasPrefix = this.cataloguesStartWhihString.some(start=>txt.startsWith(start));
        if(hasPrefix&&this.cataloguesMustContainString.some(c=>txt.includes(c)))return true;
        // Special entries accepted regardless of the two rules above.
        return this.cataloguesMableContainStringEventNotHaveFirstTwo.some(s=>txt.includes(s));
    }
    /**
     * Returns the absolute URL of the chapter an anchor points to.
     * The raw href attribute is resolved against the catalogue URL, because the
     * `href` property of an element is resolved against the page this script runs
     * on, which need not be the catalogue page.
     * @param {{href:string}} a anchor element (or an object with an `href` property)
     * @returns {string}
     */
    getContentUrl(a){
        const rawHref = typeof a.getAttribute==='function'?a.getAttribute('href'):null;
        return new URL(rawHref||a.href,this.cataloguesHtmlUrl).href;
    }
    /**
     * Fetches all chapters one after another and joins their text.
     * @param {Array<string>} urls chapter URLs
     * @param {Array<string>} titles chapter titles, index-aligned with urls
     * @returns {Promise<string>}
     */
    async getContentText(urls,titles){
        const chapters = [];
        for(let i=0,len=urls.length;i<len;++i){
            // Sequential on purpose: one request at a time.
            chapters.push(await this.getContentTextSingle(urls[i],titles[i]));
        }
        return chapters.map(s=>s+"\r\n").join('');
    }
    /**
     * Fetches one chapter page and extracts its text. The start / end markers and
     * the strings to remove are applied to the first matched content node only;
     * further matched nodes are appended unchanged.
     * @param {string} url chapter URL
     * @param {string} title chapter title
     * @returns {Promise<string>}
     */
    async getContentTextSingle(url,title){
        const html = await this.fetchHtmlContent(url);
        const htmls = this.contentCssSelector
            ?html.querySelectorAll(this.contentCssSelector)
            :[html];
        let heading = title;
        let str = '';
        for(let i=0,l=htmls.length;i<l;++i){
            let txt = htmls[i].textContent;
            if(i===0){
                let startArr;
                if(this.contentStartWithString&&this.contentStartWithString.length>=1){
                    startArr = this.contentStartWithString;
                    if(!heading)heading = startArr[0];
                }else if(heading) startArr = [heading];
                if(startArr){
                    // Cut everything before the first start marker that occurs in the text.
                    const startI = startArr.map(start=>txt.indexOf(start)).find(idx=>idx>=0);
                    if(startI===undefined){
                        if(heading)txt = heading+"\r\n"+txt;
                    }else if(startI>=1){
                        txt = txt.substring(startI);
                    }
                }
                if(this.contentEndWithString&&this.contentEndWithString.length>=1){
                    // An end marker only counts when it lies behind the heading.
                    const minEnd = heading?heading.length:1;
                    for(const end of this.contentEndWithString){
                        const endI = txt.lastIndexOf(end);
                        if(endI>minEnd){
                            txt = txt.substring(0,endI);
                            break;
                        }
                    }
                }
                if(this.contentRemoveString&&this.contentRemoveString.length>=1){
                    for(const rmStr of this.contentRemoveString){
                        txt = txt.replaceAll(rmStr,'');
                    }
                }
            }
            str += " \r\n" + txt;
        }
        str = str+"\r\n";
        console.log(str);
        return str+"\r\n";
    }
    /**
     * Fetches a page with a GET request and parses it.
     * The page is decoded with the most recently used decoder; when the page
     * declares a different charset in <meta charset>, it is decoded again with
     * that charset, which then becomes the one used for the following pages.
     * @param {string} htmlUrl
     * @returns {Promise<HTMLElement>} a detached <html> element holding the page DOM
     * @throws {Error} when htmlUrl is empty or the response status is not OK
     */
    async fetchHtmlContent(htmlUrl){
        if(!htmlUrl)throw new Error("没有传入htmlUrl");
        const resp = await fetch(htmlUrl);
        if(!resp.ok){
            throw new Error('获取页面内容失败,url:'+htmlUrl);
        }
        const buffer = await resp.arrayBuffer();
        const hStart='<html>',hEnd='</html>';
        // Keeps only what lies between <html> and </html>; when either tag is not
        // found literally (e.g. <html lang="zh">) the source is used as it is.
        const innerOf = (source)=>{
            const s = source.indexOf(hStart);
            const e = source.lastIndexOf(hEnd);
            return s>=0&&e>s?source.substring(s+hStart.length,e):source;
        };
        const html = document.createElement('html');
        html.innerHTML = innerOf(this.lastUsedTextDecoder.decode(buffer));
        const declared = html.querySelector('meta[charset]')?.getAttribute('charset');
        if(declared){
            const newCharset = declared.toUpperCase();
            if(newCharset!==this.lastUsedCharset){
                // Decoders are cached per charset on the instance.
                const decoderName = 'textDecoder'+newCharset;
                try{
                    if(!this[decoderName]){
                        this[decoderName] = new TextDecoder(newCharset);
                    }
                    this.lastUsedCharset = newCharset;
                    this.lastUsedTextDecoder = this[decoderName];
                    html.innerHTML = innerOf(this[decoderName].decode(buffer));
                }catch(e){
                    // Unknown charset label: keep the text decoded above.
                    console.warn('无法识别页面声明的字符集:'+declared,e);
                }
            }
        }
        return html;
    }
}

// Example run: download the novel "遮天".
var a =new DownloadStory('https://www.bbiquge.net/book_967/',// catalogue URL; passing only this also works (chapter titles then start with "第" and contain "章")
    '.zjlist',// CSS selector of the chapter catalogue
    null,null,null,// defaults: "第", "章", none
    '#readbox',// CSS selector of the chapter text
    null,
    ["上一章","下一章"],// strings marking the end of the chapter text
    ['jx();','笔趣阁 www.bbiquge.net,最快更新你想干什么最新章节!','hf();']// strings to delete from the chapter text
);
// down() is async: report a failure instead of leaving the rejection unhandled.
a.down().catch(e=>console.error('下载失败',e));
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章