From d61a40e7c086e9a38c5d531b8aa25e732f0c98e9 Mon Sep 17 00:00:00 2001 From: Eigeen Date: Sat, 25 Mar 2023 11:21:43 +0800 Subject: [PATCH] =?UTF-8?q?:tada:=E5=88=9D=E7=89=88=E6=94=AF=E6=8C=81?= =?UTF-8?q?=E8=87=AA=E5=8A=A8=E6=8E=A8=E9=80=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 138 ++++++++++ LICENSE | 21 ++ __init__.py | 138 ++++++++++ config.py | 6 + db.py | 69 +++++ entity_generator.py | 58 ++++ notice.py | 90 ++++++ page_parser.py | 66 +++++ sample.html | 648 ++++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 1234 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 __init__.py create mode 100644 config.py create mode 100644 db.py create mode 100644 entity_generator.py create mode 100644 notice.py create mode 100644 page_parser.py create mode 100644 sample.html diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a81c8ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6dc177d --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 Eigeen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100644 index 0000000..1dbf21c --- /dev/null +++ b/__init__.py @@ -0,0 +1,138 @@ +from .config import Config +from .page_parser import PageParser +from .db import CachedNotice +from .notice import Notice +import os +from pathlib import Path +import aiohttp +from nonebot_plugin_apscheduler import scheduler +import nonebot +from nonebot import on_command, require +from nonebot.log import logger +from nonebot.rule import Rule +from nonebot.adapters.onebot.v11.adapter import Message, MessageSegment +from nonebot.adapters.onebot.v11 import Message, MessageSegment, Bot, MessageEvent +from nonebot.matcher import Matcher +from nonebot.params import CommandArg + +require("nonebot_plugin_apscheduler") + + +notice_pusher = on_command("csust_notice_pusher", rule=Rule(), priority=5) +PACKAGE_PATH = os.path.dirname(os.path.abspath(__file__)) +DATA_PATH = Path().absolute() / "data" / "notice_pusher" +DB_PATH = DATA_PATH / "cache.db" +GLOBAL_CFG = nonebot.get_driver().config +CFG = Config.parse_obj(GLOBAL_CFG.dict()) + +XGDT_URL = 'https://www.csust.edu.cn/gjxy/xgdt.htm' +XFZX_URL = 'https://www.csust.edu.cn/gjxy/xxyfzzx.htm' +URL_PREFIX = 'https://www.csust.edu.cn/gjxy/' +HEADERS = { + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36 Edg/110.0.1587.50', +} + +logger.info('加载数据库文件: ' + str(DB_PATH)) +db = CachedNotice(DB_PATH) + + +async def check_and_push(): + msg = await auto_check() + await push_all(msg) + + +async def auto_check() -> list[str]: + msgs = [] + xgdt_notices = await get_notices(XGDT_URL) + xfzx_notices = await get_notices(XFZX_URL) + xgdt_updates = [] + xfzx_updates = [] + + # 学工动态 + for i, n in enumerate(xgdt_notices): + if db.is_notice_exists(n.id): + continue + full_url = URL_PREFIX + n.url + msg = f'{i+1}. \n标题:{n.title}\n内容预览:{n.content_preview[:40]}\n链接:{full_url}\n\n' + xgdt_updates.append(msg) + db.add_notice(Notice(*n.get_properties())) + logger.info("学工动态更新: " + msg) + # 学发中心 + for i, n in enumerate(xfzx_notices): + if db.is_notice_exists(n.id): + continue + full_url = URL_PREFIX + n.url + msg = f'{i+1}. \n标题:{n.title}\n内容预览:{n.content_preview[:40]}\n链接:{full_url}\n\n' + xfzx_updates.append(msg) + db.add_notice(Notice(*n.get_properties())) + logger.info("学发中心更新: " + msg) + + if xgdt_updates: + msgs.append('学工动态更新:\n') + msgs += xgdt_updates + + if xfzx_updates: + msgs.append('学发中心更新:\n') + msgs += xfzx_updates + + return msgs + + +async def get_notices(url: str): + page_content = await get_page(url) + if not page_content: + return + + parser = PageParser(page_content) + notices = parser.get_notices() + + return notices + + +async def get_page(url: str) -> str: + """获取页面 + + Raises: + IOError: 页面获取失败 + """ + async with aiohttp.ClientSession() as session: + async with session.get(url=url, headers=HEADERS) as res: + if res.status != 200: + raise IOError('页面获取失败: Code ' + res.status) + return await res.text() + + +async def push_all(msg: str): + for gid in CFG.notice_pusher_enable: + await push_to_group(gid, msg) + + +async def push_to_group(groupid: int, msg: str): + bot = nonebot.get_bot() + await bot.send_group_msg(group_id=groupid, message=Message(msg)) + + +async def push_error_to_admins(user_id: list[int], msg: str): + bot = nonebot.get_bot() + for u in user_id: + await bot.send_private_msg(user_id=user_id, message=Message(msg)) + +scheduler.add_job( + check_and_push, + 'cron', + hour=9, + minute=0, + second=0, + id='check_notice_and_push' +) + + +@notice_pusher.handle() +async def handle_pusher(matcher: Matcher, event: MessageEvent, arg: Message = CommandArg()): + try: + msg = await auto_check() + except IOError as e: + await push_error_to_admins(GLOBAL_CFG.superusers, str(e)) + return + + await matcher.finish(msg) diff --git a/config.py b/config.py new file mode 100644 index 0000000..6dadc49 --- /dev/null +++ b/config.py @@ -0,0 +1,6 @@ +from typing import List +from pydantic import BaseModel, Extra + + +class Config(BaseModel, extra=Extra.ignore): + notice_pusher_enable: List = [] \ No newline at end of file diff --git a/db.py b/db.py new file mode 100644 index 0000000..da6d2d1 --- /dev/null +++ b/db.py @@ -0,0 +1,69 @@ +import sqlite3 +import os +from .notice import Notice +from typing import Any +from nonebot.log import logger + + +class CachedNotice(object): + def __init__(self, db_path: str) -> None: + self.engine = None + self.db_path = db_path + self.connect() + self.init_table() + + def __del__(self): + self.engine.close() + + def connect(self): + self.engine = sqlite3.connect(self.db_path) + + def init_table(self): + cur = self.engine.cursor() + cur.execute("""CREATE TABLE IF NOT EXISTS notices ( + id BIGINT PRIMARY KEY, + url TEXT NOT NULL DEFAULT '', + title TEXT NOT NULL DEFAULT '', + content_preview TEXT NOT NULL DEFAULT '', + publish_date DATE NOT NULL DEFAULT '', + content TEXT NOT NULL DEFAULT '', + source TEXT NOT NULL DEFAULT '' +);""") + self.engine.commit() + cur.close() + + def get_notice_by_id(self, id: int) -> Notice: + cur = self.engine.cursor() + cur.execute( + f"""SELECT id, url, title, content_preview, publish_date, content, source FROM notices WHERE id = {id}""") + data = cur.fetchone() + cur.close() + + if data: + return Notice(*data) + else: + return None + + def is_notice_exists(self, notice_id: int) -> bool: + res = self.get_notice_by_id(notice_id) + if res: + return True + else: + return False + + def add_notice(self, data: Notice): + if self.is_notice_exists(data.id): + return + + cur = self.engine.cursor() + sql = """INSERT INTO notices (id, url, title, content_preview, publish_date, content, source) +VALUES ({}, '{}', '{}', '{}', '{}', '{}', '{}');""".format(*map(quote_escape, data.get_properties())) + cur.execute(sql) + self.engine.commit() + cur.close() + + +def quote_escape(x: str | Any): + if isinstance(x, str): + x = x.replace("'", "''") + return x diff --git a/entity_generator.py b/entity_generator.py new file mode 100644 index 0000000..fa3d600 --- /dev/null +++ b/entity_generator.py @@ -0,0 +1,58 @@ +def main(pstr: str, defaults: dict): + properties = pstr.rsplit(',') + result = '' + + arg_init_head = """ def __init__(self, {}):\n""" + arg_init_line = """ self.__{0} = {0}\n""" + noarg_init_head = """ def __init__(self):\n""" + noarg_init_line = """ self.__{0} = {1}\n""" + getter = """ @property + def {0}(self): + return self.__{0} +""" + setter = """ @{0}.setter + def {0}(self, value): + if value == None: + self.__{0} = '' + self.__{0} = value +""" + + # 有参构造器 + result += arg_init_head.format(', '.join(properties)) + for p in properties: + result += arg_init_line.format(p) + + # 无参构造器 + result += '\n' + result += noarg_init_head + for p in properties: + if p in defaults: + default = str(defaults[p]) + else: + default = "''" + result += noarg_init_line.format(p, default) + + # property + for p in properties: + result += '\n' + result += getter.format(p) + result += '\n' + result += setter.format(p) + + # get_properties() + result += '\n' + result += ' def get_properties(self) -> list:\n' + _ret = list(map(lambda x: 'self.' + x, properties)) + _ret = ', '.join(_ret) + result += ' return {}'.format(_ret) + + # tostring() + + + print(result) + + +if __name__ == '__main__': + properties = 'id,url,title,content_review,publish_date,content,source' + defaults = {'publish_date': None, 'id': 0} + main(properties, defaults) diff --git a/notice.py b/notice.py new file mode 100644 index 0000000..139bb1f --- /dev/null +++ b/notice.py @@ -0,0 +1,90 @@ +from datetime import date, datetime +from typing import Any + + +class Notice(object): + """通知实体类 + """ + + def __init__(self, id=0, url='', title='', content_preview='', publish_date: date=None, content='', source=''): + self.__id = id + self.__url = url + self.__title = title + self.__content_preview = content_preview + self.publish_date = publish_date + self.__content = content + self.__source = source + + @property + def id(self): + return self.__id + + @id.setter + def id(self, value): + if value == None: + self.__id = 0 + self.__id = value + + @property + def url(self): + return self.__url + + @url.setter + def url(self, value): + if value == None: + self.__url = '' + self.__url = value + + @property + def title(self): + return self.__title + + @title.setter + def title(self, value): + if value == None: + self.__title = '' + self.__title = value + + @property + def content_preview(self): + return self.__content_preview + + @content_preview.setter + def content_preview(self, value): + if value == None: + value = '' + self.__content_preview = value + + @property + def publish_date(self): + return self.__publish_date.strftime("%Y-%m-%d") + + @publish_date.setter + def publish_date(self, value: date): + if isinstance(value, str): + dt = datetime.strptime(value, '%Y-%m-%d') + value = dt.date() + self.__publish_date = value + + @property + def content(self): + return self.__content + + @content.setter + def content(self, value): + if value == None: + self.__content = '' + self.__content = value + + @property + def source(self): + return self.__source + + @source.setter + def source(self, value): + if value == None: + self.__source = '' + self.__source = value + + def get_properties(self) -> list: + return self.id, self.url, self.title, self.content_preview, self.publish_date, self.content, self.source diff --git a/page_parser.py b/page_parser.py new file mode 100644 index 0000000..586c7f9 --- /dev/null +++ b/page_parser.py @@ -0,0 +1,66 @@ +from lxml import etree +import re +from datetime import date +try: + import ujson as json +except: + import json + +try: + from .notice import Notice +except ImportError: + from notice import Notice + + +class PageParser(object): + """解析器类 + """ + REG = re.compile(r'info/\d+/(\d+).htm', re.S) + + def __init__(self, page: str) -> None: + self._page_content = page + self._notices = [] + self._parse() + + def _parse(self): + """解析页面 + """ + root = etree.HTML(self._page_content) + elements = root.xpath("//ul[@class='secList pageList']/li") + for e in elements: + notice = Notice() + notice.title = e.xpath("div[@class='r']//a/text()")[0] + url = str(e.xpath("div[@class='r']//a/@href")[0]) + notice.url = url + notice.id = self._get_id(url) + content_review = e.xpath( + "div[@class='r']/div[@class='intro']/text()")[0] + content_review = content_review.replace('\\n', '').strip() + notice.content_preview = content_review + year, month = e.xpath("div[@class='l']/text()")[0].split('-') + day = e.xpath("div[@class='l']/span/text()")[0] + notice.publish_date = date( + year=int(year), month=int(month), day=int(day)) + self._notices.append(notice) + + def _get_id(self, url: str) -> int: + res = self.REG.findall(url)[0] + return int(res) + + def get_notices(self) -> list[Notice]: + return self._notices + + # def _parse_date(self, date_str: str) -> date: + # pass + + # def test(self): + # root = etree.HTML(self._page_content) + # elements = root.xpath("//ul[@class='secList pageList']/li") + # e = elements[0] + # print(type(e.xpath("div[@class='r']//a/@href")[0])) + + +if __name__ == "__main__": + with open('sample.html', encoding='utf-8') as fp: + parser = PageParser(fp.read()) + parser.test() diff --git a/sample.html b/sample.html new file mode 100644 index 0000000..c91f660 --- /dev/null +++ b/sample.html @@ -0,0 +1,648 @@ + + + + + + 学工动态-国际工学院 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+
+ + + +
+
+ + + +
+
+ +
+ +
+
+ +
+
+
+
+

+ + + + 学工动态

+
+
+
+
+ +
+
+
+

+ + + + 学工动态

+
+
+ + + + + +
+
+ + + + + +
+ + + + + +
+ 共344条  1/29  +
首页上页  
+
+
+
+ +
+
+
+
+ +
+
+ + +
+ + + + + + + + \ No newline at end of file