这是我参与8月更文挑战的第5天，活动详情查看：8月更文挑战

前言

作为一个有6年书龄的程序员，不搞一波爬虫来看小说真是对不起自己。

所以，我就又开始了愉快的搞事情之旅。

不过接下来我要做的是在线转码，也就是说不用保存数据库，这样就不用考虑数据库了，23333。

阅读这篇文章需要一定基础，我不会讲得很详细…

这是这个在线转码项目的github地址 github.com/jialeliang/…

什么是在线转码

在线转码就是用户需要请求哪个页面，我就让服务器请求那个页面然后解析返回我想要的数据。

效果展示

可以看出我是有使用缓存的，缓存使用了redis。

需要哪些依赖项

async 队列执行
axios 这个貌似没用到，一个ajax请求的库
cheerio 类似jq的库
iconv-lite 转码，请求页面会乱码，所以要用这个来转码
koa-bodyparser koa获取post参数的库
koa-router koa路由
koa-static koa静态资源服务器
koa2 node框架
nodemon node开发的调试工具，自动刷新服务
pm2 管理node服务
redis 操作redis
request 模拟请求页面
socket.io socket服务

首先我们需要启动一个koa服务

// server/index.js
// 服务配置
// Server configuration (provides host and port)
const { server } = require('../config')
// Node's path module, re-exported by the project's common helpers
const { path } = require('../common')
// Application routes
const router = require('./router')
// Bound to `serveStatic` rather than `static`: `static` is a reserved word in
// strict mode, so the original binding name makes the file fail to parse as
// soon as it is loaded with 'use strict' or as an ES module.
const serveStatic = require('koa-static')
const Koa = require('koa2');
const bodyParser = require('koa-bodyparser')
const app = new Koa();

// Serve static assets from <ROOTPATH>/view
app.use(serveStatic(path.join(global.ROOTPATH, './view')))
// Parse request bodies so that route handlers can read ctx.request.body
app.use(bodyParser());

/**
 * Wires the shared router into the shared Koa app and starts listening.
 *
 * Relies on the module-level `server` (config), `app` and `router` bindings.
 */
class Server{
    constructor(){
        const { port } = server;
        this.port = port;
        this.app = app;
        this.router = router;
    }

    /**
     * Registers the route middleware, then the allowed-methods middleware,
     * and starts listening on `this.port`. Logs the address once listening.
     */
    start(){
        // Chain on the value returned by the first use() call, as the original did.
        const withRoutes = this.app.use(this.router.routes());
        withRoutes.use(this.router.allowedMethods());

        const announce = () => {
            console.log(`Koa服务已启动，打开地址: http://${server.host}:${server.port}`)
        };
        this.app.listen(this.port, announce);
    }
}

module.exports = Server;

// bin/www
// 启动服务
// Create the server wrapper and start listening right away.
const server = new Server();

server.start();
复制代码

封装请求页面的函数

// userAgent是不同终端的headers信息
const { request, userAgent } = require('../config')
const rq = require('request')

module.exports = (config = {}) => {
    // request是config/request中的配置文件，config是每个不同api使用的配置
    config = Object.assign(config, request);
    return new Promise((resolve, reject) => {
        // 判断是否拥有http，没有就添加, 这里还需要判断uri是否为字符串
        let prefix = config.uri.slice(0, 4);
        if(prefix != 'http') config.uri = 'http://' + config.uri;
        
        // 添加headers信息，因为有的网站会有pc、pe端的判断，所以需要加上这个，并且还有30秒的搜索限制，加上这个就没有这个问题了
        let agent = userAgent[config.agent || 'pc'];
        config.headers['User-Agent'] = agent[Math.floor(agent.length * Math.random())];

        rq(config, (err, res, body) => {
            if(err){
                return reject(err);
            }
            // 不同页面也会有不同的处理方法
            resolve(config.transform ? config.transform(body, res) : body);
        })
    });
};
复制代码

接下来，正式开始编写爬虫程序

首先，我们需要写一个路由：

// 这是爬取数据的方法，下面再介绍
const { searchBook } = require('../scripts')
const { toJson } = require(global.ROOTPATH + '/common')
// 这个是请求页面获取数据的dom选择器
const { origin } = require(global.ROOTPATH + '/config')

const Router = require('koa-router')
const router = new Router();

// 首页中需要获取有哪些来源，请求这个路由就可以返回给前端了
// Responds with the configured sources wrapped by toJson.
router.get('/origin', (ctx) => {
    const payload = toJson(origin, 'success');
    ctx.body = payload;
})

// 前端调用了这个路由，路由就会执行爬虫程序
// Runs the crawler for the posted search and responds with its result.
router.post('/book/search', async (ctx) => {
    // name: search keyword, origins: keys of the sources to query,
    // socketId: forwarded so progress can be sent to this one client.
    const { name, origins, socketId } = ctx.request.body;
    const found = await searchBook(name, origins, socketId);
    ctx.body = toJson(found, 'success');
})

module.exports = router;
复制代码

看一下来源配置中的数据:

// Source definitions, keyed by source id.
// Selector values are '/'-separated pipelines interpreted by the Dom parse
// method: plain segments are passed to find(), the others (eq-N, text, html,
// attrhref, attrsrc, addhref, not-X, allSpace, remove-X) are processing steps.
module.exports = {
    biquge: {
        // Basic settings
        name: '笔趣阁',
        // Site base address; the addhref step prepends it to extracted links
        href: 'http://www.qbiqu.com',
        // Search address; presumably {search} is replaced with the keyword — TODO confirm against Dom.setHref
        search: 'http://www.qbiqu.com/modules/article/search.php?searchkey={search}',
        // NOTE(review): code/device/status are not read by the code shown here; looks like page encoding, User-Agent family and an enabled flag — confirm
        code: 'gbk',
        device: 'pc',
        status: true,

        // Search page DOM selectors
        searchCode: 'gbk',
        searchList: 'tbody>tr',
        // Index of the first searchList element that getSearchList reads
        searchIndex: 1,
        searchTitle: 'td/a/eq-0/text',
        searchHref: 'td/a/eq-0/attrhref',
        searchAuthor: 'td/eq-2/text',
        searchNewChapter: 'td/eq-1/text',
        searchUpdated: 'td/eq-4/text',

        // Detail page DOM selectors
        infoTitle: '#maininfo h1/text',
        infoAuthor: '#maininfo p/eq-0/text/allSpace/not-作者：',
        infoNewChapter: '#maininfo p/eq-3/text/not-最新章节：',
        infoUpdated: '#maininfo p/eq-2/text/not-最后更新：',
        infoImage: '#fmimg img/attrsrc/addhref',
        infoDescription: '#intro p/eq-0/html/allSpace',
        infoChapterList: '#list dd',
        // NOTE(review): presumably the first infoChapterList index to read, by analogy with searchIndex — confirm
        infoChapterIndex: 9,
        infoChapterTitle: 'a/text',
        infoChapterHref: 'a/attrhref/addhref',

        // Chapter page DOM selectors
        chapterTitle: 'h1/text',
        chapterContent: '#content/remove-div/html',
        chapterPrevHref: '.bottem2 a/eq-2/attrhref/addhref',
        chapterNextHref: '.bottem2 a/eq-4/attrhref/addhref'
    },
}
复制代码

因为别人网站中的dom要么摆放得花里胡哨，要么拿到的不是我想要的数据，如果在程序中专门为每一个数据单独处理，将会花费大量的时间，程序也会因此变得臃肿，所以就专门为此写了一个处理这些问题的方法。

从上面配置的值中可以清楚地看到，有 /attrhref、/addhref、/eq-2、/html、/text、/allSpace、/not-*、/remove-* 这些css选择器中没有的参数，这就需要自己去编写处理函数了。

这里贴一下处理这些参数的函数：/ 用来分隔各个参数，函数会从一开始获取的dom出发，按顺序一个个地往下处理。

parse(dom, selector){
    let arr = selector.split('/');
    let result = null;
    arr.map(item => {
        let arr = item.split('-');
        switch(arr[0]){
            case '':
                break;
            case 'html':
                result = result.html();
                break;
            case 'text':
                result = result.text();
                break;
            case 'attrhref':
                result = result.attr('href');
                break;
            case 'attrsrc':
                result = result.attr('src');
                break;
            case 'eq':
                result = result.eq(arr[1]);
                break;
            case 'addhref':
                result = result ? this.get('href') + result : '';
                break;
            case 'not':
                result = typeof result == 'string' ? result.replace(arr[1], '') : result;
                break;
            case 'allSpace':
                result = typeof result == 'string' ? result.replace(/\s/g, '') : result;
                break;
            case 'remove':
                result.find(arr[1]).remove();
                break;
            default:
                result = dom.find(item);
                break;
        }
    })
    return result;
}
复制代码

这样基本就可以获取到想要的数据了，如果还有什么需要处理的，再往上面加就可以。

然后就是获取页面的函数了

// 这里解释一下Queue、Dom是什么东西
// Queue是用async封装的队列类，new Queue 就创建了一个队列，queue.push({}) 往队列中添加任务
// Dom是用来解析页面的数据的类，只要把dom扔进去，然后调用getSearchList就可以返回相关的数据
const { Rq, cheerio, iconv, Queue, Dom, checkReplite, redis, ws } = require(global.ROOTPATH + '/common')
const { origin } = require(global.ROOTPATH + '/config')

/**
 * Emits a 'searchResult' event carrying `flag` and the socket id through `ws`.
 * Does nothing when no socket id is given.
 *
 * @param {*} id Socket id of the requesting client; falsy values are ignored.
 * @param {*} flag Progress flag forwarded as-is (callers pass 1 or 0).
 */
function emitMsg(id, flag){
    if(id){
        ws.emit('searchResult', flag, id);
    }
}

/**
 * Searches one source for `name` and resolves with the parsed result list.
 *
 * The promise chain is returned directly instead of being wrapped in
 * `new Promise`: with the wrapper, an exception thrown inside a handler
 * (for example by the socket emit on the failure path) left the promise
 * pending forever, which in turn kept the search queue from draining.
 *
 * @param {string} name Search keyword.
 * @param {Object} replite Source configuration handed to `Dom`.
 * @param {string} key Key of the source, echoed back as `originKey`.
 * @param {*} id Socket id used for progress notifications; may be omitted.
 * @returns {Promise<Object>} Resolves with `{ url, data, origin, originKey }`.
 *   Rejects with a message string on request or parse failure (a string, not
 *   an Error, because callers collect it into the response they serialise).
 */
function search(name, replite, key, id){
    // Dom holds the source's selectors and builds the search address.
    const dom = new Dom(replite);
    const href = dom.setHref(name);

    return Rq({
        uri: href,
        transform(body, response){
            return {
                // Page parsed into a $ object
                $: dom.transform(body),
                // Final address, which differs from `href` after a redirect
                path: response.request.href
            };
        }
    })
    .then(({ $, path }) => {
        dom.load($);
        const data = dom.getSearchList(path);
        emitMsg(id, 1);

        return {
            url: href,
            data,
            origin: dom.get('name'),
            originKey: key
        };
    })
    .catch(err => {
        console.error(`请求 《${name}》${replite.name} 时发生错误: ` + err);
        emitMsg(id, 0);
        return Promise.reject('请求' + replite.name + '时错误，错误代码: ' + ((err && err.code) || err));
    });
}

module.exports = (name = '', origins = [], socketId) => {
    // 为每个请求都创建一个队列，这样就不会影响到其他人了
    const queue = new Queue('搜索小说');
    return new Promise(async resolve => {
        if(name == '' || origins.length <= 0) {
            resolve({
                msg: '参数错误'
            });
        } else {
            // 判断是否有redis缓存数据
            let redisSearchData = await redis.get('data', `search-${name}`);
            if(redisSearchData){
                resolve(Object.assign(JSON.parse(redisSearchData), {
                    isRedis: true
                }));
                return;
            }

            let result = [];
            let errors = [];

            // 将每个来源都调用一次search函数，并将其扔进队列中
            origins.forEach((key) => {
                let replite = origin[key];
                if(replite && checkReplite(replite)){
                    // 这个写法是在Queue中封装好的
                    queue.push({
                        //  给fn函数提供的参数
                        params: [name, replite, key, socketId],
                        // 队列执行search函数
                        fn: search,
                        // search返回成功
                        async success(data){
                            result.push(data);
                        },
                        // search返回失败
                        error(err){
                            errors.push(err);
                        }
                    });
                } else {
                    errors.push(`不能使用${replite.name}的来源`);
                }
            });
            // 所有任务执行完毕执行
            queue.end(async () => {
                let data = {
                    result, errors
                };
                // 缓存一天
                await redis.set('data', `search-${name}`, JSON.stringify(data), 60 * 60 * 24);
                resolve(data)
            })
        }
    })
}
复制代码

下面讲一下Queue类的封装

const async = require('async')

// Counter used to name queues that are created without an explicit name.
let len = 1;

/**
 * Small wrapper around the `async` library's queue.
 *
 * A task is an object `{ fn, params, success, error }`: `fn` is applied to
 * `params`; `success(result)` runs when it fulfils and `error(err)` when it
 * fails. Handlers and `fn` are invoked with the Queue instance as `this`.
 */
class Queue{
    /**
     * @param {string|number} [name] Name used in log messages; defaults to a running number.
     * @param {number} [maxLength=10] Number of tasks processed concurrently.
     */
    constructor(name, maxLength = 10){
        this.name = name || len++;
        this.async = async;
        this.maxLength = maxLength
        this.bind();
    }

    /**
     * Replaces `this.async` with the underlying queue object.
     *
     * The worker guarantees that the queue callback runs exactly once per
     * task: a synchronous throw or non-promise return from `fn`, a failing
     * `success` handler and a throwing `error` handler are all funnelled into
     * the failure path instead of escaping and stalling the queue.
     */
    bind(){
        // async function: turns sync throws into rejections and accepts plain return values.
        const run = async (task) => {
            const value = await task.fn.apply(this, task.params);
            // Awaited so that an async success handler finishes before the task completes.
            if(task.success) await task.success.apply(this, [value]);
        };
        const fail = (task, err) => {
            try {
                if(task.error) task.error.apply(this, [err]);
            } catch(handlerErr) {
                // The handler itself failed; log it and still complete the task.
                console.error('队列 ' + this.name + ' 的 error 回调执行失败: ' + handlerErr);
            }
        };

        this.async = this.async.queue((task, done) => {
            run(task).then(
                () => done(),
                (err) => {
                    fail(task, err);
                    done(err);
                }
            );
        }, this.maxLength);
    }

    /**
     * Adds a task to the queue; failures are logged with the queue name.
     * @param {Object} obj Task `{ fn, params, success, error }`.
     */
    push(obj){
        this.async.push(obj, (err) => {
            if(err)
                console.log('运行 ' + this.name + ' 队列时发生错误: ' + err + '，错误时间: ' + new Date());
        })
    }

    /** Value reported by the underlying queue's length(). */
    get length(){
        return this.async.length();
    }

    /**
     * Registers `fn` to run when the underlying queue reports it has drained.
     * @param {Function} fn Called without arguments.
     */
    end(fn){
        this.async.drain(() => {
            let date = new Date();
            console.log(`队列 ${this.name} 执行完毕, 完成时间: ` + date.toLocaleDateString() + ' ' + date.toLocaleTimeString());
            fn();
        });
    }
}

module.exports = Queue;
复制代码

然后就是Dom的getSearchList方法，和getDom方法

// 获取origin中key为name 的值，如果没有值的话，将会返回val值
get(name, val = ''){
    return this.replite[name] || val;
}

// 使用origin中的值解析获取相对应得值
getDom(dom, name){
    dom = dom || this.$('body');
    return this.parse(dom, this.get(name));
}

getSearchList(path){
    let list = [];
    // 有可能会重定向，path是之前setHref保存在实例中数据，重定向后直接从重定向后的页面中获取数据
    if(path != this.href){
        return [{
            title: this.getDom(null, 'infoTitle'),
            href: path,
            author: this.getDom(null, 'infoAuthor'),
            newChapter: this.getDom(null, 'infoNewChapter'),
            updated: this.getDom(null, 'infoUpdated')
        }];
    } else {
        // 获取所有的结果数据
        let searchList = this.$(this.get('searchList'));
        for(let i = this.get('searchIndex', 0); i < searchList.length; i++){
            let item = searchList.eq(i);
            let data = {
                title: this.getDom(item, 'searchTitle'),
                href: this.getDom(item, 'searchHref'),
                author: this.getDom(item, 'searchAuthor'),
                newChapter: this.getDom(item, 'searchNewChapter'),
                updated: this.getDom(item, 'searchUpdated')
            };
            list.push(data);
        }
        return list;
    }
}
复制代码