package task

import (
	"time"

	"github.com/furryboard/spider-scheduler/pkg/conf"
	"github.com/furryboard/spider-scheduler/pkg/dao/model"
	"github.com/furryboard/spider-scheduler/pkg/log"
	"github.com/furryboard/spider-scheduler/pkg/logic"
)

// SearchUpsFromVideo discovers users by running a keyword video search and
// collecting the uploaders of the returned videos. Uploaders that are not yet
// in the user table are inserted with status model.StatusPendingVideo.
//
// Strategy: each run scans up to the first 3 result pages (a full page is
// expected to hold 42 videos). The bvid of the first video on page 1 is stored
// as the "SearchVideoCheckpoint" metadata. On the next run, scanning stops as
// soon as that bvid is seen again, because everything after it was already
// handled. The original note says the job runs every 10 minutes; the schedule
// itself is not defined in this file.
func SearchUpsFromVideo() {
	const (
		keyword  = "furry"
		maxPages = 3
		pageSize = 42 // number of videos on a full result page
	)

	// Load the checkpoint left by the previous run. A failure here is not
	// fatal: without a checkpoint the run simply scans all maxPages pages.
	meta, err := logic.GetMetadata("SearchVideoCheckpoint")
	if err != nil {
		log.Logger().Warnf("获取上一次视频抓取记录失败:%s", err)
	}
	history := ""
	if meta != nil {
		history = meta.Value
	}

	newHistory := ""
	for page := 1; page <= maxPages; page++ {
		// Request the page that is actually being iterated (the previous code
		// always requested page 1, so pages 2 and 3 were never fetched).
		// NOTE(review): assumes the page parameter is a uint, matching the
		// logic.SearchLiveRooms call in this file — confirm against
		// logic.SearchNewestVideos.
		videos, err := logic.SearchNewestVideos(keyword, conf.Conf.SpiderCore.BiliCookie, uint(page))
		if err != nil {
			log.Logger().Errorf("搜索视频失败:%s", err)
			return
		}
		if len(videos) == 0 {
			log.Logger().Info("视频搜索已达末页")
			break
		}

		// The newest video of this run becomes the checkpoint for the next run.
		if page == 1 {
			newHistory = videos[0].Bvid
		}

		// Index the page by bvid so the previous checkpoint can be located.
		bvidIndex := make(map[string]int, len(videos))
		for i, video := range videos {
			bvidIndex[video.Bvid] = i
		}

		// Only videos before the previous checkpoint are new. If the
		// checkpoint is not on this page, the whole page is new.
		end := len(videos)
		reachedCheckpoint := false
		if i, ok := bvidIndex[history]; ok {
			end = i
			reachedCheckpoint = true
		}

		// Deduplicate uploaders by uid (mid) before touching the database, so
		// each user is checked at most once per page.
		users := make(map[uint64]*model.Furry, end)
		for _, video := range videos[:end] {
			users[video.Mid] = &model.Furry{
				UID:    uint(video.Mid),
				Name:   video.Author,
				Status: model.StatusPendingVideo,
			}
		}

		// Insert every uploader that is not in the user table yet. A failure
		// for one user is logged and does not stop the others.
		for uid, user := range users {
			exists, err := logic.IsUserExistsByUID(uint(uid))
			if err != nil {
				log.Logger().Warnf("判断用户存在性时发生错误:%s", err)
				continue
			}
			if exists {
				continue
			}
			if err := logic.AddUser(user); err != nil {
				log.Logger().Warnf("新增用户[%d]时失败:%s", uid, err)
				continue
			}
			log.Logger().Infof("通过视频搜索新增用户:%s(%d)", user.Name, uid)
		}

		// Stop once the previous checkpoint was reached, or when the page was
		// not full (treated as the last page); otherwise go on to the next page.
		if reachedCheckpoint || len(videos) < pageSize {
			break
		}
		time.Sleep(1 * time.Second)
	}

	// If no video was fetched there is no new checkpoint. Keep the stored one
	// rather than overwriting it with an empty value, which would make the
	// next run rescan every page.
	if newHistory == "" {
		return
	}
	if err := logic.UpdateVideoCheckpoint(newHistory); err != nil {
		log.Logger().Warnf("更新Metadata: SearchVideoCheckpoint时发生错误:%s", err)
	}
}

// SearchUpsFromLiveRoom searches live rooms for each configured keyword and
// passes every result page to logic.UpdateLiveRoomInfo. Keywords are handled
// in order; the first error aborts the whole run, so later keywords are
// skipped.
func SearchUpsFromLiveRoom() {
	for _, keyword := range []string{"furry", "冬聚"} {
		if !updateLiveRoomsByKeyword(keyword) {
			return
		}
	}
}

// updateLiveRoomsByKeyword fetches the number of live-room result pages for
// keyword, then walks the pages one by one and hands each page's rooms to
// logic.UpdateLiveRoomInfo. It reports whether every step succeeded; errors
// are logged here and not returned.
func updateLiveRoomsByKeyword(keyword string) bool {
	// Get the page count first, then fetch page by page.
	pageNum, err := logic.GetLiveRoomPageNum(keyword, conf.Conf.SpiderCore.BiliCookie)
	if err != nil {
		log.Logger().Errorf("搜索直播间错误:%s", err)
		return false
	}

	for page := 1; page <= pageNum; page++ {
		rooms, err := logic.SearchLiveRooms(keyword, conf.Conf.SpiderCore.BiliCookie, uint(page))
		if err != nil {
			log.Logger().Errorf("搜索直播间错误:%s", err)
			return false
		}

		count, err := logic.UpdateLiveRoomInfo(rooms)
		if err != nil {
			log.Logger().Errorf("更新直播间信息错误:%s", err)
			return false
		}
		log.Logger().Infof("更新了%d个直播间信息", count)

		// Pause between page requests.
		time.Sleep(1 * time.Second)
	}
	return true
}