# Scrape Chinese Wikipedia day pages (events / births / deaths) into MySQL.
# Database credentials (placeholders — fill in real values before running).
username = "username"
password = "password"
dbname = "dbname"
import datetime
import re

import pymysql
import requests
from bs4 import BeautifulSoup
# Open the MySQL connection and the single cursor shared by the whole script.
conn = pymysql.connect(host = "127.0.0.1", user = username, password = password, db = dbname, charset = "utf8")
print(conn)
cur = conn.cursor()
# Switch the session to 4-byte UTF-8 so CJK text survives the inserts.
cur.execute("SET NAMES utf8mb4")
def savedb(data):
    """Insert one record into the `event` table.

    data: a 4-tuple (type, year, date, info) matching the table's
    non-id columns. Rows the database rejects (bad encoding or
    over-long text) are logged to failed.txt instead of aborting
    the whole crawl.
    """
    print(data)
    try:
        cur.execute("insert into event values(null,%s,%s,%s,%s)", data)
    except pymysql.err.InternalError:
        print("\033[31mERROR: Incorrect string value.\033[0m", data)
        _log_failed(data)
    except pymysql.err.DataError:
        print("\033[31mERROR: Data too long.\033[0m", data)
        _log_failed(data)


def _log_failed(data):
    """Append a rejected row to failed.txt for later inspection."""
    with open("failed.txt", "a") as myfile:
        myfile.write(str(data) + "\n")
def getDateList(year=2016):
    """Return every date of *year* formatted as 'M月D日' (e.g. '1月1日').

    The original hard-coded 366 iterations, which is only correct for
    leap years such as 2016; iterating until the year rolls over handles
    any year. Defaults to 2016 for backward compatibility.
    """
    dates = []
    current = datetime.date(year, 1, 1)
    while current.year == year:
        dates.append("%d月%d日" % (current.month, current.day))
        current += datetime.timedelta(days = 1)
    return dates
def getInfo(html, type, date):
    """Parse one section of a Wikipedia day page and save its entries.

    html: full page HTML.
    type: section index — 0 events (大事记), 1 births (出生), 2 deaths (逝世).
    date: the day string (e.g. '1月1日'), stored alongside each entry.

    Each matching <li> of the form '123年: text' or '前123年: text' is
    stripped of footnote markers like [1] and passed to savedb.
    """
    typeList = ["大事记", "出生", "逝世"]
    # Isolate the section between its <h2> heading and the next <h2>.
    # ('.*' stays on one line; '[\s\S]' spans newlines — same as original.)
    section = re.search(
        r"(<h2><span id=.*<span class=\"mw-headline\" id=.*?"
        + typeList[type]
        + r"[\s\S]*?</ul>\s*?)<h2>",
        html,
    )
    if not section:
        return
    # Compile once, outside the per-<li> loop (the original recompiled
    # both patterns for every list item).
    entry_re = re.compile(r"((^前|^)\d{1,4}年):([\s\S]*$)")
    cite_re = re.compile(r"\[\d{1,}\]")  # footnote markers like [1]
    for li in BeautifulSoup(section.group(1), "html.parser").findAll("li"):
        match = entry_re.match(li.get_text())
        if match:
            year = match.group(1)
            info = cite_re.sub("", match.group(3).strip())
            savedb((type, year, date, info))
# Crawl each day page once and record all three section types from the
# same response. (The original bound the date list to the name `list`,
# shadowing the builtin.)
for date in getDateList():
    print(date)
    url = "https://zh.wikipedia.org/zh-cn/%s" % date
    r = requests.get(url)
    for section_type in range(3):  # 0=events, 1=births, 2=deaths
        getInfo(r.text, section_type, date)

# Persist everything and release DB resources.
cur.connection.commit()
cur.close()
conn.close()