js抓取網絡小說內容下載

  • js抓取網絡小說內容方法1
/**
 * Download every chapter linked from the current catalogue page as one .txt file.
 *
 * Usage: open the novel's catalogue page, press F12, paste this function into the
 * console and call it.
 *
 * Steps performed:
 *  1. Collect all `<a>` elements whose href starts with the catalogue page URL
 *     (origin + pathname); for duplicated hrefs only the last occurrence is kept.
 *  2. Fetch the chapters one after another and reduce each page to its text after
 *     dropping all `<a>` and `<script>` elements.
 *  3. Delete the caller supplied strings, strip boilerplate shared by neighbouring
 *     chapters (pairs 0-1, 2-3, ...), put the chapter title on its own line and
 *     trigger a download named "<page title>.txt".
 *
 * A chapter that cannot be fetched (network error or non-2xx status) is logged and
 * left empty instead of stalling the whole download.
 *
 * @param {string|Array<string>} removeString a string, or an array of strings,
 *        to delete from every chapter
 */
function simpleDownloadStoryInCatalogueWeb(removeString=""){
    // URL of the catalogue page; chapter links are expected to start with it.
    const catalogueWebUrl = document.location.origin+document.location.pathname;
    // Novel name, taken from the page <title>; may be empty when there is none.
    const storyTitle = document.querySelector('title')?.innerText || '';

    // Walk the anchors backwards so that the LAST occurrence of every href wins,
    // then restore document order.
    const allAnchors = [...document.getElementsByTagName('a')];
    const seenHrefs = new Set();
    const catalogues = [];
    for(let i=allAnchors.length-1;i>=0;--i){
        const anchor = allAnchors[i];
        if(!anchor||!anchor.href||!anchor.href.startsWith(catalogueWebUrl))continue;
        if(seenHrefs.has(anchor.href))continue;
        seenHrefs.add(anchor.href);
        catalogues.push(anchor);
    }
    catalogues.reverse();

    // Decode chapters with the charset declared by the catalogue page, if any.
    const charset = document.querySelector('meta[charset]')?.getAttribute('charset') || 'utf-8';
    let textDecoder;
    try{
        textDecoder = new TextDecoder(charset);
    }catch(e){
        // Unknown encoding label: fall back instead of aborting.
        console.warn('不支持的字符集:'+charset+',改用utf-8',e);
        textDecoder = new TextDecoder('utf-8');
    }

    const catalogueLen = catalogues.length;
    const allStoryStrArr = Array.from({length:catalogueLen},()=>'');

    // Number of identical characters at the end of both strings.
    const commonSuffixLength = (s1,s2)=>{
        const max = Math.min(s1.length,s2.length);
        let n = 0;
        while(n<max&&s1[s1.length-1-n]===s2[s2.length-1-n])++n;
        return n;
    };
    // Number of identical characters at the start of both strings.
    const commonPrefixLength = (s1,s2)=>{
        const max = Math.min(s1.length,s2.length);
        let n = 0;
        while(n<max&&s1[n]===s2[n])++n;
        return n;
    };
    // Make chapter `idx` start at its own title when the title is found and more
    // than 100 characters follow it. Returns true when the chapter starts with its
    // title afterwards.
    const cutAtTitle = (idx)=>{
        const text = allStoryStrArr[idx];
        const title = catalogues[idx].innerText.trim();
        const titleIdx = text.indexOf(title);
        if(titleIdx===0)return true;
        if(titleIdx>=1&&titleIdx+title.length<text.length-100){
            allStoryStrArr[idx] = text.substring(titleIdx);
            return true;
        }
        return false;
    };
    // Compare chapters pairwise and drop the text they share at the tail and head.
    const removeSurplus = ()=>{
        for(let i=0;i+1<catalogueLen;i+=2){
            const first = allStoryStrArr[i];
            const second = allStoryStrArr[i+1];
            if(!first||!second)continue;
            // Shared tail (site footer and the like); never empty a chapter.
            const tailLen = commonSuffixLength(first,second);
            if(tailLen>0&&tailLen<Math.min(first.length,second.length)){
                allStoryStrArr[i] = first.substring(0,first.length-tailLen);
                allStoryStrArr[i+1] = second.substring(0,second.length-tailLen);
            }
            // Shared head: prefer cutting at the chapter title.
            const firstAnchored = cutAtTitle(i);
            const secondAnchored = cutAtTitle(i+1);
            if(firstAnchored||secondAnchored)continue;
            // Neither title was usable: drop the common prefix, measured on the
            // CURRENT strings so that only shared text is removed.
            const cur1 = allStoryStrArr[i];
            const cur2 = allStoryStrArr[i+1];
            const headLen = commonPrefixLength(cur1,cur2);
            if(headLen>0&&headLen<Math.min(cur1.length,cur2.length)){
                allStoryStrArr[i] = cur1.substring(headLen);
                allStoryStrArr[i+1] = cur2.substring(headLen);
            }
        }
    };

    // Post-process the chapter texts and start the download.
    const detailStory = ()=>{
        console.log("完成小說讀取,正在準備處理內容");
        let removals = [];
        if(Array.isArray(removeString)){
            removals = removeString.filter(Boolean);
        }else if(typeof removeString === 'string' && removeString.length>=1){
            removals = [removeString];
        }
        for(let i=0;i<catalogueLen;++i){
            for(const str of removals)
                allStoryStrArr[i] = allStoryStrArr[i].replaceAll(str,"");
        }
        removeSurplus();
        console.log('完成部分無關內容移除,正在添加章節換行');
        for(let i=0;i<catalogueLen;++i){
            if(!allStoryStrArr[i])continue;
            const text = allStoryStrArr[i].replaceAll("\n\n","\r\n\r\n\n\n")+"\r\n\r\n";
            const ti = catalogues[i].innerText.trim();
            // Put the title on its own line. Plain concatenation is used because
            // String.replace would interpret `$` sequences inside the title.
            allStoryStrArr[i] = text.startsWith(ti)
                ? ti+"\r\n"+text.substring(ti.length)
                : ti+"\r\n"+text;
        }
        console.log('小說內容處理完成,正在準備下載');
        const a = document.createElement('a');
        const name = storyTitle||'小說';
        const blob = new Blob([name,"\r\n\r\n\r\n",...allStoryStrArr], {type: 'text/plain'});
        const url = window.URL.createObjectURL(blob);
        const filename = name+".txt";
        a.href = url;
        a.download = filename;
        a.click();
        window.URL.revokeObjectURL(url);
    };

    // Fetch one chapter page and return its text without links and scripts.
    const fetchChapterText = async (anchor)=>{
        const resp = await fetch(anchor.href);
        if(!resp.ok)throw new Error('HTTP '+resp.status);
        let htmlStr = textDecoder.decode(await resp.arrayBuffer());
        // Keep only what is between a bare <html> and the last </html>, if both exist.
        const hStart='<html>',hEnd='</html>';
        const idxS = htmlStr.indexOf(hStart);
        const idxE = htmlStr.lastIndexOf(hEnd);
        if(idxS>=0 &&idxE>idxS)
            htmlStr = htmlStr.substring(idxS+hStart.length,idxE);
        const html = document.createElement('html');
        html.innerHTML = htmlStr;
        // querySelectorAll returns a static list, so one pass removes everything.
        html.querySelectorAll('a,script').forEach(el=>el.remove());
        return html.innerText || '';
    };

    // Fetch the chapters strictly one at a time, then build the file.
    // If the site starts blocking requests, add a delay between iterations.
    const run = async ()=>{
        for(let idx=0;idx<catalogueLen;++idx){
            const anchor = catalogues[idx];
            const title = anchor.innerText.trim();
            try{
                allStoryStrArr[idx] = await fetchChapterText(anchor);
                console.log("完成讀取章節:"+title);
            }catch(e){
                allStoryStrArr[idx] = '';
                console.error("讀取章節失敗:"+title,e);
            }
        }
        detailStory();
    };
    run().catch(e=>console.error(e));
}
// Run the downloader right away, using the default (empty) removeString.
simpleDownloadStoryInCatalogueWeb();
  • js抓取網絡小說內容方法2
/**
 * Novel downloader driven by CSS selectors and title/content markers.
 * Create the object on the catalogue page or on any page that is not a chapter page.
 */
class DownloadStory{
    /**
     * Build a downloader; every list-like parameter also accepts a single string.
     * @param {string} cataloguesHtmlUrl URL of the page that lists the chapters; defaults to the current location
     * @param {string} cataloguesCssSelector CSS selector of the element(s) containing the chapter links
     * @param {Array<string>} cataloguesStartWhihString a chapter title starts with one of these, default ['第']
     * @param {Array<string>} cataloguesMustContainString a chapter title contains one of these, default ['章']
     * @param {Array<string>} cataloguesMableContainStringEventNotHaveFirstTwo a title containing one of these is accepted even when the two rules above do not match
     * @param {string} contentCssSelector CSS selector of the element(s) holding the chapter text
     * @param {Array<string>} contentStartWithString the chapter text starts at one of these; when empty the chapter title is used
     * @param {Array<string>} contentEndWithString the chapter text ends before one of these
     * @param {Array<string>} contentRemoveString strings deleted from the chapter text (all of them)
     * @param {string} charset charset used to decode the pages, default GBK
     * @param {string} title novel title; when omitted the catalogue page <title> is used
     * @throws {Error} when no catalogue URL is given and no current location is available
     */
    constructor(cataloguesHtmlUrl,
                cataloguesCssSelector,
                cataloguesStartWhihString,
                cataloguesMustContainString,
                cataloguesMableContainStringEventNotHaveFirstTwo,
                contentCssSelector,
                contentStartWithString,
                contentEndWithString,
                contentRemoveString,
                charset='GBK',title){
        if(title){
            this.title = title;
        }
        let catalogueUrl = cataloguesHtmlUrl;
        if(!catalogueUrl){
            if(globalThis.location&&globalThis.location.href){
                catalogueUrl = globalThis.location.href;
            }
            else throw new Error('請傳入一個章節目錄');
        }
        // Arrays pass through, a single truthy value is wrapped, anything else gets the default.
        const toList = (value,fallback)=>Array.isArray(value)?value:(value?[value]:fallback);
        // An explicit null/empty charset falls back to the documented default.
        const charsetLabel = charset||'GBK';
        this.cataloguesHtmlUrl = catalogueUrl;
        this.cataloguesHtmlUrlOrigin = new URL(catalogueUrl).origin;
        this.cataloguesCssSelector = cataloguesCssSelector;
        this.cataloguesStartWhihString = toList(cataloguesStartWhihString,['第']);
        this.cataloguesMustContainString = toList(cataloguesMustContainString,['章']);
        this.cataloguesMableContainStringEventNotHaveFirstTwo = toList(cataloguesMableContainStringEventNotHaveFirstTwo,[]);
        this.contentCssSelector = contentCssSelector;
        this.contentStartWithString = toList(contentStartWithString,[]);
        this.contentEndWithString = toList(contentEndWithString,[]);
        this.contentRemoveString = toList(contentRemoveString,[]);
        this.charset = charsetLabel.toUpperCase();
        this.lastUsedCharset = this.charset;
        this.textDecoder = new TextDecoder(charsetLabel);
        this.lastUsedTextDecoder = this.textDecoder;
    }

    /**
     * Download the novel: find the chapter links, fetch every chapter and
     * offer the joined text as "<title>.txt".
     * @throws {Error} when no chapter URL or no chapter text was found
     */
    async down(){
        await this.findStoryCataloguesInHtml();
        const urls = this.allContentUrls;
        const titles = this.allContentTitles;
        if(urls==null||urls.length<=0) throw new Error('沒找到小說內容url');
        const str = await this.getContentText(urls,titles);
        if(!str) throw new Error('沒找到小說內容');
        const a = document.createElement('a');
        const name = this.title||'小說';
        const blob = new Blob([name+"\r\n\r\n\r\n"+str], {type: 'text/plain'});
        const url = window.URL.createObjectURL(blob);
        const filename = name+".txt";
        a.href = url;
        a.download = filename;
        a.click();
        window.URL.revokeObjectURL(url);
    }
    /**
     * Find the links of all chapters on the catalogue page.
     * Fills `allContentUrls` and `allContentTitles`, which are index-aligned:
     * each URL appears once, paired with the text of its first matching link.
     */
    async findStoryCataloguesInHtml(){
        const html = await this.fetchHtmlContent(this.cataloguesHtmlUrl);
        if(!this.title){
            // A page without <title> leaves the title empty; down() then uses its fallback name.
            this.title = html.querySelector('title')?.innerText || '';
            this.allContextText = this.title+"\r\n\r\n";
        }
        const scopes = this.cataloguesCssSelector
            ? [...html.querySelectorAll(this.cataloguesCssSelector)]
            : [];
        // No selector, or nothing matched: search the whole page.
        if(scopes.length<=0)scopes.push(html);
        // One Map keeps every URL together with its title; two separate Sets would
        // drift apart as soon as two chapters share a title.
        const chapters = new Map();
        for(const scope of scopes){
            const anchors = scope.tagName.toUpperCase()==='A' ? [scope] : scope.querySelectorAll('a');
            for(const a of anchors){
                if(!this.isCatalogue(a))continue;
                const url = this.getContentUrl(a);
                if(!chapters.has(url))chapters.set(url,a.innerText);
            }
        }
        this.allContentUrls = [...chapters.keys()];
        this.allContentTitles = [...chapters.values()];
    }
    /**
     * Decide whether an anchor is a chapter link.
     * @param {HTMLAnchorElement} a anchor to test
     * @returns {boolean} true when the anchor has an href and its text matches the title rules
     */
    isCatalogue(a){
        if(!a.href)return false;
        const txt = a.innerText;
        if(!txt)return false;
        const startsLikeChapter = this.cataloguesStartWhihString.some(start=>txt.startsWith(start));
        if(startsLikeChapter&&this.cataloguesMustContainString.some(c=>txt.includes(c)))
            return true;
        return this.cataloguesMableContainStringEventNotHaveFirstTwo.some(s=>txt.includes(s));
    }
    /**
     * Get the absolute URL of the chapter an anchor points to.
     * The raw `href` attribute is resolved against the catalogue URL, because the
     * `href` property is resolved against the page the script runs on.
     * @param {HTMLAnchorElement} a chapter anchor
     * @returns {string} absolute chapter URL
     */
    getContentUrl(a){
        const rawHref = typeof a.getAttribute==='function' ? a.getAttribute('href') : null;
        try{
            return new URL(rawHref ?? a.href,this.cataloguesHtmlUrl).href;
        }catch(e){
            // Unparsable href: hand back whatever the anchor reports.
            return a.href;
        }
    }
    /**
     * Fetch all chapters one after another and join their text.
     * @param {Array<string>} urls chapter URLs
     * @param {Array<string>} titles chapter titles, index-aligned with `urls`
     * @returns {Promise<string>} the text of all chapters
     */
    async getContentText(urls,titles){
        const len = urls.length;
        const parts = new Array(len);
        // Deliberately sequential: one request at a time.
        for(let i=0;i<len;++i){
            parts[i] = await this.getContentTextSingle(urls[i],titles[i]);
        }
        return parts.map(s=>s+"\r\n").join('');
    }
    /**
     * Fetch one chapter and return its text.
     * Only the first element matched by `contentCssSelector` is trimmed to the
     * start/end markers and cleaned of `contentRemoveString`; further matches are
     * appended unchanged.
     * @param {string} url chapter URL
     * @param {string} title chapter title
     * @returns {Promise<string>} chapter text
     */
    async getContentTextSingle(url,title){
        const html = await this.fetchHtmlContent(url);
        const htmls = this.contentCssSelector ? html.querySelectorAll(this.contentCssSelector) : [html];
        let chapterTitle = title;
        let str = '';
        for(let i=0,l=htmls.length;i<l;++i){
            let txt = htmls[i].textContent;
            if(i===0){
                let startArr;
                if(this.contentStartWithString&&this.contentStartWithString.length>=1){
                    startArr = this.contentStartWithString;
                    if(!chapterTitle)chapterTitle = startArr[0];
                }else if(chapterTitle) startArr = [chapterTitle];
                if(startArr){
                    // Cut everything before the first start marker that occurs.
                    let startNot = true;
                    for(const start of startArr){
                        const startI = txt.indexOf(start);
                        if(startI>=0){
                            if(startI>=1)
                                txt = txt.substring(startI);
                            startNot = false;
                            break;
                        }
                    }
                    // No marker found: put the title in front instead.
                    if(startNot&&chapterTitle)
                        txt = chapterTitle+"\r\n"+txt;
                }
                if(this.contentEndWithString&&this.contentEndWithString.length>=1){
                    // An end marker only counts when it lies behind the title.
                    const minEnd = chapterTitle ? chapterTitle.length : 1;
                    for(const end of this.contentEndWithString){
                        const endI = txt.lastIndexOf(end);
                        if(endI>minEnd){
                            txt = txt.substring(0,endI);
                            break;
                        }
                    }
                }
                if(this.contentRemoveString&&this.contentRemoveString.length>=1){
                    for(const rmStr of this.contentRemoveString){
                        txt = txt.replaceAll(rmStr,'');
                    }
                }
            }
            str += " \r\n" + txt;
        }
        str = str+"\r\n";
        console.log(str);
        return str+"\r\n";
    }
    /**
     * Fetch a page with a GET request and parse it.
     * The page is decoded with the most recently used decoder; when it declares a
     * different charset in `<meta charset>`, it is decoded again with that charset,
     * which then becomes the decoder for following pages.
     * @param {string} htmlUrl page URL
     * @returns {Promise<HTMLElement>} a detached `<html>` element holding the page
     * @throws {Error} when no URL is given or the response status is not ok
     */
    async fetchHtmlContent(htmlUrl){
        if(!htmlUrl)throw new Error("沒有傳入htmlUrl");
        const resp = await fetch(htmlUrl);
        if(!resp.ok){
            throw new Error('獲取頁面內容失敗,url:'+htmlUrl);
        }
        const buffer = await resp.arrayBuffer();
        const hStart='<html>',hEnd='</html>';
        // Keep what lies between a bare <html> and the last </html>; when either is
        // missing (e.g. <html lang="...">) use the text unchanged.
        const stripHtmlWrapper = (text)=>{
            const idxS = text.indexOf(hStart);
            const idxE = text.lastIndexOf(hEnd);
            return idxS>=0&&idxE>idxS ? text.substring(idxS+hStart.length,idxE) : text;
        };
        const html = document.createElement('html');
        html.innerHTML = stripHtmlWrapper(this.lastUsedTextDecoder.decode(buffer));
        // If the page declares a charset in a meta tag, decode again with it.
        const declaredCharset = html.querySelector('meta[charset]')?.getAttribute('charset');
        if(declaredCharset){
            const newCharset = declaredCharset.toUpperCase();
            if(newCharset!==this.lastUsedCharset){
                const decoderName = 'textDecoder'+newCharset;
                let decoder = this[decoderName];
                if(!decoder){
                    try{
                        decoder = new TextDecoder(newCharset);
                    }catch(e){
                        // Unknown label: keep the text decoded with the current decoder.
                        console.warn('不支持的字符集:'+newCharset,e);
                    }
                }
                if(decoder){
                    this[decoderName] = decoder;
                    this.lastUsedCharset = newCharset;
                    this.lastUsedTextDecoder = decoder;
                    html.innerHTML = stripHtmlWrapper(decoder.decode(buffer));
                }
            }
        }
        return html;
    }
}

// Demo: download the novel "Zhe Tian" from bbiquge.
// `var` is kept so that `a` remains a global binding when pasted into the console.
var a =new DownloadStory('https://www.bbiquge.net/book_967/',// catalogue URL; passing only this also works, chapter titles then default to "starts with 第 and contains 章"
    '.zjlist',// CSS selector of the chapter list
    null,null,null,// use the defaults: 第, 章, none
    '#readbox',// CSS selector of the chapter text
    null,
    ["上一章","下一章"],// strings marking the end of the chapter text
    ['jx();','筆趣閣 www.bbiquge.net,最快更新你想幹什麼最新章節!','hf();']// strings to delete from the chapter text
);
// down() is async and throws when nothing is found; report that instead of
// leaving an unhandled promise rejection.
a.down().catch(err=>console.error('下載小說失敗',err));
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章