# nonebot_plugin_csust_notice/page_parser.py
import re
from datetime import date

from lxml import etree

# Prefer the faster ujson if installed; fall back to the stdlib json.
# (Catch ImportError specifically — a bare except would also swallow
# unrelated errors such as KeyboardInterrupt.)
try:
    import ujson as json
except ImportError:
    import json

# Support running both as a package module and as a standalone script.
try:
    from .notice import Notice
except ImportError:
    from notice import Notice
class PageParser(object):
    """Parse a CSUST notice-list HTML page into ``Notice`` objects.

    The page is parsed eagerly in ``__init__``; retrieve the results
    with :meth:`get_notices`.
    """

    # Extracts the numeric notice id from URLs like "info/1043/12345.htm".
    # BUG FIX: the '.' before "htm" is now escaped so it only matches a
    # literal dot (the old pattern also matched e.g. "Xhtm"); the useless
    # re.S flag (no '.' spanning newlines needed) was dropped.
    REG = re.compile(r'info/\d+/(\d+)\.htm')

    def __init__(self, page: str) -> None:
        """Parse *page* (raw HTML text) immediately.

        :param page: full HTML source of a notice-list page.
        """
        self._page_content = page
        self._notices: "list[Notice]" = []
        self._parse()

    def _parse(self) -> None:
        """Walk every <li> of the notice list and build Notice objects."""
        root = etree.HTML(self._page_content)
        elements = root.xpath("//ul[@class='secList pageList']/li")
        for e in elements:
            notice = Notice()
            notice.title = e.xpath("div[@class='r']//a/text()")[0]
            url = str(e.xpath("div[@class='r']//a/@href")[0])
            notice.url = url
            notice.id = self._get_id(url)
            content_review = e.xpath(
                "div[@class='r']/div[@class='intro']/text()")[0]
            # BUG FIX: the original replaced the literal two-character
            # sequence "\n" ('\\n'); strip real newline characters instead.
            content_review = content_review.replace('\n', '').strip()
            notice.content_preview = content_review
            # Date layout on the page: "YYYY-MM" as the left div's text,
            # day number inside its <span>.
            year, month = e.xpath("div[@class='l']/text()")[0].split('-')
            day = e.xpath("div[@class='l']/span/text()")[0]
            notice.publish_date = date(
                year=int(year), month=int(month), day=int(day))
            self._notices.append(notice)

    def _get_id(self, url: str) -> int:
        """Return the numeric notice id embedded in *url*.

        :raises IndexError: if *url* does not match :data:`REG`.
        """
        res = self.REG.findall(url)[0]
        return int(res)

    def get_notices(self) -> "list[Notice]":
        """Return the parsed notices in document order."""
        return self._notices
# def _parse_date(self, date_str: str) -> date:
# pass
# def test(self):
# root = etree.HTML(self._page_content)
# elements = root.xpath("//ul[@class='secList pageList']/li")
# e = elements[0]
# print(type(e.xpath("div[@class='r']//a/@href")[0]))
if __name__ == "__main__":
    # Manual smoke test: parse a locally saved sample page and print the
    # results. (The original called parser.test(), a method that exists
    # only as commented-out code and would raise AttributeError.)
    with open('sample.html', encoding='utf-8') as fp:
        parser = PageParser(fp.read())
    for n in parser.get_notices():
        print(n.id, n.publish_date, n.title, n.url)