spider-scheduler/pkg/task/search.go

142 lines
4.2 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

package task
import (
"github.com/eigeen/furryboard/spider-scheduler/pkg/conf"
"github.com/eigeen/furryboard/spider-scheduler/pkg/dao/model"
"github.com/eigeen/furryboard/spider-scheduler/pkg/log"
"github.com/eigeen/furryboard/spider-scheduler/pkg/logic"
"time"
)
// SearchUpsFromVideo discovers users by running a keyword video search and
// collecting the uploaders of the returned videos. Every uploader that
// logic.IsUserExistsByUID reports as missing is added through logic.AddUser
// with status model.StatusPendingVideo.
//
// Strategy (the original design note describes a 10-minute schedule; the
// scheduling itself is not part of this function): scan up to the first 3
// result pages, 42 videos per full page. The Bvid of the first video on page 1
// is stored as the new checkpoint. Scanning stops as soon as a page contains
// the video recorded as the previous checkpoint; only the videos listed before
// it on that page are treated as new.
func SearchUpsFromVideo() {
	const (
		maxPages = 3  // number of result pages scanned per run
		pageSize = 42 // videos on a full result page; a shorter page is the last one
	)

	// Load the checkpoint written by the previous run. A failure is not fatal:
	// without a checkpoint every scanned page is simply treated as new.
	meta, err := logic.GetMetadata("SearchVideoCheckpoint")
	if err != nil {
		log.Logger().Warnf("获取上一次视频抓取记录失败:%s", err)
	}
	var history string
	if meta != nil {
		history = meta.Value
	}

	var newHistory string
	for page := 1; page <= maxPages; page++ {
		// Request the page currently being scanned (the previous code always
		// requested page 1).
		// NOTE(review): uint(page) mirrors the logic.SearchLiveRooms call in
		// this file; confirm SearchNewestVideos declares its page parameter as uint.
		videos, err := logic.SearchNewestVideos("furry", conf.Conf.SpiderCore.BiliCookie, uint(page))
		if err != nil {
			log.Logger().Errorf("搜索视频失败:%s", err)
			return
		}
		if len(videos) == 0 {
			log.Logger().Info("视频搜索已达末页")
			break
		}
		// The first video of the first page becomes the next checkpoint.
		if page == 1 {
			newHistory = videos[0].Bvid
		}

		// Locate the previous checkpoint on this page. Videos in [0, end) are
		// the ones not seen by the previous run.
		end := len(videos)
		reachedHistory := false
		if history != "" {
			for i := range videos {
				if videos[i].Bvid == history {
					end = i
					reachedHistory = true
					break
				}
			}
		}

		// Deduplicate uploaders by mid before touching the database.
		uids := make(map[uint64]*model.Furry, end)
		for i := 0; i < end; i++ {
			uids[videos[i].Mid] = &model.Furry{
				UID:    uint(videos[i].Mid),
				Name:   videos[i].Author,
				Status: model.StatusPendingVideo,
			}
		}

		// Insert every uploader that is not stored yet. Failures are logged
		// and skipped so one bad record does not abort the page.
		for uid, user := range uids {
			exists, err := logic.IsUserExistsByUID(uint(uid))
			if err != nil {
				log.Logger().Warnf("判断用户存在性时发生错误:%s", err)
				continue
			}
			if exists {
				continue
			}
			if err := logic.AddUser(user); err != nil {
				log.Logger().Warnf("新增用户[%d]时失败:%s", uid, err)
				continue
			}
			log.Logger().Infof("通过视频搜索新增用户:%s(%d)", user.Name, uid)
		}

		// Stop once the previous checkpoint was reached or the page was not
		// full (no further pages to fetch).
		if reachedHistory || len(videos) < pageSize {
			break
		}
		// Pause between requests; pointless after the final page.
		if page < maxPages {
			time.Sleep(1 * time.Second)
		}
	}

	// Nothing was fetched, so keep the previous checkpoint instead of
	// overwriting it with an empty value.
	if newHistory == "" {
		return
	}
	if err := logic.UpdateVideoCheckpoint(newHistory); err != nil {
		log.Logger().Warnf("更新Metadata: SearchVideoCheckpoint时发生错误%s", err)
	}
}
// SearchUpsFromLiveRoom searches live rooms for a fixed list of keywords and
// hands every result page to logic.UpdateLiveRoomInfo.
//
// For each keyword the page count is fetched first with
// logic.GetLiveRoomPageNum, then pages 1..pageNum are requested in order with
// a one-second pause after each page. Any error is logged and aborts the whole
// run, including keywords that have not been processed yet.
func SearchUpsFromLiveRoom() {
	// Keywords are processed in this order; add new search terms here.
	keywords := []string{"furry", "冬聚"}

	for _, keyword := range keywords {
		// Fetch the page count first, then walk the pages one by one.
		pageNum, err := logic.GetLiveRoomPageNum(keyword, conf.Conf.SpiderCore.BiliCookie)
		if err != nil {
			log.Logger().Errorf("搜索直播间错误:%s", err)
			return
		}
		for page := 1; page <= pageNum; page++ {
			rooms, err := logic.SearchLiveRooms(keyword, conf.Conf.SpiderCore.BiliCookie, uint(page))
			if err != nil {
				log.Logger().Errorf("搜索直播间错误:%s", err)
				return
			}
			count, err := logic.UpdateLiveRoomInfo(rooms)
			if err != nil {
				log.Logger().Errorf("更新直播间信息错误:%s", err)
				return
			}
			log.Logger().Infof("更新了%d个直播间信息", count)
			// Pause between requests.
			time.Sleep(1 * time.Second)
		}
	}
}