给掘金做了一个数据统计分析工具-一一网

本文已参与好文召集令活动，点击查看：后端、大前端双赛道投稿，2万元奖池等你挑战！

先看看成果

以7月1日数据为例

7月1日总获赞数Top10

用户	总获赞
掘金安东尼	234
chokcoco	185
手撕红黑树	148
海拥	114
程序员小杰	104
月伴飞鱼	103
LBJ	100
淘系前端团队	92
岛上码农	89
云的世界	80

7月1日总浏览数Top10

用户	总浏览
手撕红黑树	6037
chokcoco	5689
洛竹	5495
呆呆敲代码的小Y	5254
华为开发者论坛	4829
掘金安东尼	4445
俺老刘	3967
海拥	3784
alphardex	3459
淘系前端团队	3448

7月1日单时间段获赞Top10

用户	时间段	获赞
月伴飞鱼	07-01 00:41 – 07-01 00:46	45
程序员小杰	07-01 09:59 – 07-01 10:04	37
红尘炼心	07-01 21:42 – 07-01 21:47	27
LBJ	07-01 21:32 – 07-01 21:37	26
胖DA	07-01 17:41 – 07-01 17:46	20
小村儿	07-01 01:31 – 07-01 01:36	17
岛上码农	07-01 08:43 – 07-01 08:49	15
潇雷	07-01 22:07 – 07-01 22:12	13
在剥我的壳	07-01 17:31 – 07-01 17:36	10
俺老刘	07-01 19:01 – 07-01 19:06	10

7月1日单时间段浏览Top10

用户	时间段	获浏览
华为开发者论坛	07-01 11:14 – 07-01 11:19	874
摸鱼专家	07-01 15:25 – 07-01 20:52	210
Honest1y	07-01 16:45 – 07-01 16:50	157
mPaaS	07-01 11:44 – 07-01 20:06	103
俺老刘	07-01 13:15 – 07-01 13:20	101
洛竹	07-01 09:44 – 07-01 09:49	89
chokcoco	07-01 13:47 – 07-01 13:55	89
手撕红黑树	07-01 09:34 – 07-01 09:39	89
alphardex	07-01 10:24 – 07-01 10:29	85
爱鼓捣的程序猿	07-01 11:29 – 07-01 11:34	84

数据准不准? 答：非常准确

是不是很好奇怎么做到的？

开搞

思路是持续监控作者并定时拉取数据。作者从哪里找？这里用的是作者排行榜：
- 缺点是会漏掉一些没上榜的作者

抓取数据

我们看到有好多分类，所以先抓分类
这个就不多解释，看代码

    /**
     * Fetches the category list from the category-briefs endpoint.
     *
     * @return the entries of the response's top-level "data" array, converted to {@link Category}
     */
    private static List<Category> getAllCategory() {
        final String endpoint = "https://api.juejin.cn/tag_api/v1/query_category_briefs?show_type=1";
        final String responseBody = Http.get(endpoint);
        // All categories are listed under the "data" key of the response object.
        final JSONArray categories = JSONUtil.parseObj(responseBody).getJSONArray("data");
        return JSONUtil.toList(categories, Category.class);
    }

    /**
     * One category entry, converted from an element of the "data" array returned by the
     * category-briefs endpoint. Declared as a nested class.
     */
    static class Category implements Serializable {

        // Category identifier; sent as the category_id query parameter when fetching authors.
        private String category_id;
        private String category_name;
        private String category_url;

        // Getters and setters omitted.
    }

复制代码

拉取所有作者

/**
 * Polls the author ranking of every category in an endless loop and appends one JSON line per
 * polling round to a per-day file named {@code ./j-<day>.json}.
 *
 * <p>Each line is a single-entry JSON object: the key is the snapshot timestamp label and the
 * value is the list of authors seen in that round, de-duplicated by user id across categories.
 * The file is written as UTF-8 so it matches the charset used when the data is read back.
 *
 * <p>The loop ends only when the thread is interrupted while sleeping between rounds; in that
 * case the interrupt flag is restored and the method returns.
 */
public static void run() {
    System.out.println("拉取分类");
    List<Category> categoryList = getAllCategory();
    while (true) {
        // Read the clock once so the snapshot label and the target file always refer to the
        // same day, even when a round straddles midnight.
        LocalDateTime snapshotTime = LocalDateTime.now();
        String now = snapshotTime.format(dateFormat);
        HashMap<String, Author> authorHashMap = new HashMap<>();

        System.out.println("开始拉取数据：" + now);

        // Collect the authors of every category; an author listed in several categories is
        // kept once because the map is keyed by user id.
        for (Category category : categoryList) {
            try {
                for (Author author : getAllAuthor(category)) {
                    author.setTime(now);
                    authorHashMap.put(author.getUser_id(), author);
                }
            } catch (Exception e) {
                // Best effort: one failing category must not abort the whole round.
                e.printStackTrace();
            }
        }

        // Persist the round by appending to the day file instead of keeping all rounds in memory.
        String path = "./j-" + snapshotTime.toLocalDate().format(dayFormat) + ".json";
        try {
            initFile(path);
            // try-with-resources closes the writer (and the underlying stream) even if
            // serialisation throws; the charset is explicit instead of the platform default.
            try (PrintWriter pw = new PrintWriter(new java.io.OutputStreamWriter(
                    new java.io.FileOutputStream(path, true), java.nio.charset.StandardCharsets.UTF_8))) {
                // println supplies the line terminator, so the JSON string itself needs none.
                pw.println(JSONUtil.toJsonStr(MapUtil.of(now, authorHashMap.values())));
                // PrintWriter never throws on write errors; it only records them.
                if (pw.checkError()) {
                    System.err.println("写入数据失败：" + path);
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }

        // Wait before the next round.
        System.out.println("拉取数据结束：" + now);
        try {
            // Long arithmetic avoids overflow should pullTime be a large int.
            Thread.sleep(pullTime * 1000L);
        } catch (InterruptedException e) {
            // Restore the flag and stop; continuing would make every later sleep fail at once.
            Thread.currentThread().interrupt();
            return;
        }
    }
}

//获取所有作者
/**
 * Builds the author-recommend URL for one category.
 *
 * @param categoryId value placed verbatim into the category_id query parameter
 * @return the request URL with cursor=0 and limit=100
 */
private static String getUrl(String categoryId) {
    final String template =
            "https://api.juejin.cn/user_api/v1/author/recommend?category_id=%s&cursor=0&limit=100";
    return String.format(template, categoryId);
}
/**
 * Fetches the recommended authors of one category.
 *
 * @param category category whose id is used to build the request URL
 * @return the entries of the response's "data" array converted to {@link Author}, or an empty
 *         list when the request or the conversion throws (the stack trace is printed)
 */
private static List<Author> getAllAuthor(Category category) {
    List<Author> authors = Collections.emptyList();
    try {
        final String responseBody = Http.get(getUrl(category.getCategory_id()));
        final JSONArray entries = JSONUtil.parseObj(responseBody).getJSONArray("data");
        authors = JSONUtil.toList(entries, Author.class);
    } catch (Exception e) {
        // Best effort: report the failure and fall through to the empty default.
        e.printStackTrace();
    }
    return authors;
}

/**
 * One author entry, converted from an element of the "data" array returned by the
 * author-recommend endpoint, plus the snapshot time attached by the polling loop.
 */
static class Author implements Serializable {
    // Used as the map key when de-duplicating authors across categories.
    private String user_id;
    private String user_name;
    // Counters are kept as strings here; the analysis code converts them with toInt.
    private String got_digg_count;
    private String got_view_count;
    private String avatar_large;
    private String company;
    private String job_title;
    private String level;
    private String description;
    private String author_desc;
    // Snapshot timestamp label; not part of the API response, set by run() on every poll.
    private String time;

    // Getters and setters omitted.
}

复制代码

到这里拉数据就结束了

分析数据

这里分析使用的是scala，java太鸡肋了，不方便

一天数据大概是 60M，秒级出结果

第一步先读取数据文件

    /**
     * All loaded snapshots, keyed by the snapshot timestamp label; each value is the list of
     * authors recorded in that snapshot.
     *
     * Backed by a LinkedHashMap: constant-time `put` while keeping insertion order, whereas
     * `mutable.ListMap` scans its entries linearly on every `put`.
     */
    val map: mutable.Map[String, List[Author]] = mutable.LinkedHashMap.empty[String, List[Author]]

    /**
     * Reads a snapshot file (one JSON object per line, each mapping a snapshot timestamp label
     * to a list of authors) and stores every snapshot in `map`.
     *
     * @param path file to read; defaults to the originally hard-coded file so `load()` keeps working
     */
    def load(path: String = "./j-20210630.json"): Unit = {
        // The target type is the same for every line, so build it once instead of per line.
        val snapshotType: Type = new TypeReference[util.Map[String, util.List[Author]]] {}.getType
        val lineList = new util.ArrayList[String]()
        val in = new FileInputStream(path)
        // Close the stream ourselves; closing again is harmless if the reader already did.
        try IoUtil.readLines(in, StandardCharsets.UTF_8, lineList)
        finally in.close()
        lineList.forEach(line => {
            // Skip blank lines (e.g. a trailing empty line) rather than feeding them to the parser.
            if (line.trim.nonEmpty) {
                // NOTE(review): the trailing `true` is presumably hutool's ignoreError flag - confirm.
                val bean: util.Map[String, util.List[Author]] = JSONUtil.toBean(line, snapshotType, true)
                bean.asScala.foreach(entry => map.put(entry._1, entry._2.asScala.toList))
            }
        })
    }
复制代码

根据需求分析数据

    /**
     * Entry point: loads the snapshot file and prints four Top-10 tables - total likes of the
     * day, total views of the day, and the single interval with the largest gain in likes and
     * in views.
     *
     * Steps: take all snapshots, flatten them to one author record per (snapshot, author),
     * group by user id, order each author's records by snapshot time, then summarise.
     */
    def main(args: Array[String]): Unit = {
        load()
        val stats: List[DayStats] = map.values.flatten
            .groupBy(_.getUser_id)
            .values
            // Ordering relies on the time label sorting chronologically as a plain string.
            .map(records => summarize(records.toList.sortBy(_.getTime)))
            .toList

        printTotalTop10("\n-----------------当天总获赞Top10------------------", "总获赞", stats)(_.dayDigg)
        printTotalTop10("\n-----------------当天总浏览Top10------------------", "总浏览", stats)(_.dayView)
        printPeakTop10("\n-----------------当天单时间段获赞Top10------------------", "获赞", stats)(_.maxDigg, _.maxDiggTime)
        printPeakTop10("\n-----------------当天单时间段浏览Top10------------------", "获浏览", stats)(_.maxView, _.maxViewTime)
    }

    /** Per-author result of one day's analysis. */
    private final case class DayStats(
        userName: String,
        userId: String,
        dayDigg: Int,
        dayView: Int,
        maxDigg: Int,
        maxDiggTime: String,
        maxView: Int,
        maxViewTime: String
    )

    /**
     * Summarises one author's records.
     *
     * @param timeline the author's records ordered by snapshot time; never empty because it
     *                 comes from a `groupBy` group
     */
    private def summarize(timeline: List[Author]): DayStats = {
        val first = timeline.head
        val last = timeline.last
        val (maxDigg, maxDiggTime) = peakInterval(timeline)(_.getGot_digg_count.toInt)
        val (maxView, maxViewTime) = peakInterval(timeline)(_.getGot_view_count.toInt)
        DayStats(
            userName = first.getUser_name,
            userId = first.getUser_id,
            // Day totals are the difference between the last and the first snapshot.
            dayDigg = last.getGot_digg_count.toInt - first.getGot_digg_count.toInt,
            dayView = last.getGot_view_count.toInt - first.getGot_view_count.toInt,
            maxDigg = maxDigg,
            maxDiggTime = maxDiggTime,
            maxView = maxView,
            maxViewTime = maxViewTime
        )
    }

    /**
     * Finds the pair of consecutive snapshots with the largest positive increase of `counter`.
     *
     * Uses overlapping windows (`sliding(2)`) so every interval between neighbouring snapshots
     * is examined; a step of 2 would skip every other interval.
     *
     * @return the gain and its "start - end" label, or `(0, "")` when no interval has a
     *         positive gain; on ties the earliest interval wins
     */
    private def peakInterval(timeline: List[Author])(counter: Author => Int): (Int, String) =
        timeline.sliding(2).foldLeft((0, "")) { case (best @ (bestGain, _), window) =>
            val from = window.head
            val to = window.last
            val gain = counter(to) - counter(from)
            // getOutTime is defined outside this block; it turns the time label into its display form.
            if (gain > bestGain) (gain, s"${getOutTime(from.getTime)} - ${getOutTime(to.getTime)}")
            else best
        }

    /** Prints the banner and a two-column table of the ten authors with the highest `count`. */
    private def printTotalTop10(banner: String, countLabel: String, stats: List[DayStats])
                               (count: DayStats => Int): Unit = {
        val row = "|%-12s\t|%-5s|\n"
        println(banner)
        printf(row, "用户", countLabel)
        printf(row, "-" * 12, "-" * 5)
        stats.sortBy(count)(Ordering.Int.reverse).take(10).foreach { s =>
            printf(row, s.userName, count(s))
        }
    }

    /** Prints the banner and a three-column table (author, interval, gain) of the ten highest `count`. */
    private def printPeakTop10(banner: String, countLabel: String, stats: List[DayStats])
                              (count: DayStats => Int, interval: DayStats => String): Unit = {
        val row = "|%-12s\t|%-25s\t|%-5s|\n"
        println(banner)
        printf(row, "用户", "时间段", countLabel)
        printf(row, "-" * 12, "-" * 25, "-" * 5)
        stats.sortBy(count)(Ordering.Int.reverse).take(10).foreach { s =>
            printf(row, s.userName, interval(s), count(s))
        }
    }
复制代码