From a941ee38dac5af9f5b3adac576e1219054bff751 Mon Sep 17 00:00:00 2001 From: Fabian Gerle Date: Sun, 8 Oct 2023 21:53:21 +0200 Subject: [PATCH] some fixes and add ancestor methods to MathDB class --- MathGen.py | 292 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 244 insertions(+), 48 deletions(-) diff --git a/MathGen.py b/MathGen.py index c1b5611..8378e6f 100644 --- a/MathGen.py +++ b/MathGen.py @@ -5,6 +5,11 @@ import time from igraph import Graph import sqlite3 from sqlite3 import Error +import json +import re +import logging +#logging.warning('Watch out!') # will print a message to the console +#logging.info('I told you so') # will not print anything class mathPage: @@ -21,15 +26,17 @@ class mathPage: self.year = None self.diss = None self.advisorID = [None, None, None] + self.students = "{}" def get_entry(self): - return((self.id, self.name, self.title, self.inst, self.year, self.diss, self.advisorID[0], self.advisorID[1], self.advisorID[2])) + return((self.id, self.name, self.title, self.inst, self.year, self.diss, self.advisorID[0], self.advisorID[1], self.advisorID[2], self.students)) def get_info(self): + logging.info(f'Downloading info to mathID {self.id} from the online database.') temp = requests.get(self.url) temp.encoding = "utf-8" self.page = temp.text @@ -138,7 +145,30 @@ class mathPage: studentsYear.append(s) self.studentsYear = studentsYear + i = 1 + while True: + try: + descText = self.tree.xpath(f'//*[@id="paddingWrapper"]/p[{i}]/text()') + except IndexError: + break + if ("According to our current" in descText[0]) or ("No students known" in descText[0]): + break + i += 1 + if ("No students known" in descText[0]): + self.students = json.dumps({}) + else: + try: + numStudents = int(''.join(filter(str.isdigit, descText[0]))) + except ValueError: + numStudents = 0 + try: + numDescendants = int(''.join(filter(str.isdigit, descText[1]))) + except ValueError: + numDescendants = 0 + self.students = json.dumps({"MathID": studentsID, "Institute": studentsInst, "Year": studentsYear, "Students": numStudents, "Descendants": numDescendants}) + + class mathDB: @@ -147,6 +177,15 @@ class mathDB: self.db_file = db_file self.init_db() + + def get_cursor(self, connection=None): + if connection == None: + conn = self.create_connection() + else: + conn = connection + cur = conn.cursor() + return(cur, conn) + def create_connection(self): @@ -188,7 +227,8 @@ class mathDB: thesis text, first_advisor integer, second_advisor integer, - third_advisor integer + third_advisor integer, + students text );""" if conn is not None: self.create_table(conn, sql_create_main_table) @@ -197,16 +237,30 @@ class mathDB: print("Cannot create database connection") + def add_person(self, mathID, connection=None): + if not self.exists(mathID): + try: + p = mathPage(mathID) + p.get_info() + self.insert_person(p, connection=connection) + except ValueError: + print(f"Error: No entry with mathID {mathID} exists.") + + def insert_person(self, mathPage, connection=None): + entry = mathPage.get_entry() + if entry[1] == None: + raise ValueError("No such entry in the math genealogy project found.") + return if connection == None: conn = self.create_connection() else: conn = connection cur = conn.cursor() row = self.get_person(mathPage.id, conn) - if len(row) == 0: - cur.execute("INSERT INTO mathematicians VALUES (?,?,?,?,?,?,?,?,?)", mathPage.get_entry()) + if row[0] == 0: + cur.execute("INSERT INTO mathematicians VALUES (?,?,?,?,?,?,?,?,?,?)", entry) else: cur.execute(""" UPDATE mathematicians @@ -217,21 +271,61 @@ class mathDB: thesis = ?, first_advisor = ?, second_advisor = ?, - third_advisor = ? + third_advisor = ?, + students = ? WHERE id = ? - """, (mathPage.name, mathPage.title, mathPage.inst, mathPage.year, mathPage.diss, mathPage.advisorID[0], mathPage.advisorID[1],mathPage.advisorID[2], mathPage.id)) + """, (mathPage.name, mathPage.title, mathPage.inst, mathPage.year, mathPage.diss, mathPage.advisorID[0], mathPage.advisorID[1],mathPage.advisorID[2], mathPage.students, mathPage.id)) conn.commit() if connection == None: conn.close() - - - def find_missing(self, limit): + + def get_students(self, mathID, connection=None): +# conn = sqlite3.connect(self.db_file) + cur, conn = self.get_cursor(connection) + students = [] + if not self.exists(mathID): + try: + self.add_person(mathID, connection=connection) + except ValueError: + print(f"Error: No entry with mathID {mathID} exists.") + return(students) + cur.execute(""" + SELECT id + FROM mathematicians + WHERE first_advisor = ? + OR second_advisor = ? + OR third_advisor = ? + """, (mathID,mathID,mathID),) + temp = cur.fetchall() + for s in temp: + students.append(s[0]) + if connection == None: + conn.close() + return(students) + + def get_students_entry(self, mathID, connection=None): + cur, conn = self.get_cursor(connection) + if not self.exists(mathID): + try: + self.add_person(mathID, connection) + except ValueError: + print(f"Error: No entry with mathID {mathID} exists.") + p = self.get_person(mathID, connection=conn) + students = json.loads(p[9]) + return(students["MathID"]) + if connection == None: + conn.close() + + + + + def find_missing(self, limit, connection=None): """ Find missing entries in database. """ - conn = sqlite3.connect(self.db_file) - cur = conn.cursor() +# conn = sqlite3.connect(self.db_file) + cur,conn = self.get_cursor(connection) missing = [] if type(limit) == int: limit = range(1, limit+1) @@ -250,17 +344,18 @@ class mathDB: return(missing) if p == 0: missing.append(i) - conn.close() + if connection == None: + conn.close() return(missing) - def exists(self, id): - conn = sqlite3.connect(self.db_file) - cur = conn.cursor() - cur.execute("SELECT EXISTS(SELECT * FROM mathematicians WHERE id = ?)", (id,)) + def exists(self, mathID, connection=None): + cur,conn = self.get_cursor(connection) + cur.execute("SELECT EXISTS(SELECT * FROM mathematicians WHERE id = ?)", (mathID,),) p = cur.fetchall()[0][0] - conn.close() + if connection == None: + conn.close() if p == 1: return(True) else: @@ -276,34 +371,34 @@ class mathDB: def check_missing_data(self, id): - conn = sqlite3.connect(self.db_file) - cur = conn.cursor() - cur.execute("SELECT EXISTS(SELECT * FROM mathematicians WHERE id = ?)", (id,)) + cur,conn = self.get_cursor(connection) + cur.execute("SELECT EXISTS(SELECT * FROM mathematicians WHERE id = ?)", (id,),) p = cur.fetchall()[0][0] if p == 0: - conn.close() + if connection == None: + conn.close() return((1,1,1,1,1,1,1,1)) else: - cur.execute("SELECT * FROM mathematicians WHERE id = ?", (id,)) + cur.execute("SELECT * FROM mathematicians WHERE id = ?", (id,),) data = cur.fetchall()[0] - conn.close() + if connection == None: + conn.close() return(data) # print(data) def get_person(self, id, connection=None): - if connection == None: - conn = self.create_connection() - else: - conn = connection - conn = self.create_connection() - cur = conn.cursor() - res = cur.execute(f"SELECT * FROM mathematicians WHERE id = {id}") + cur,conn = self.get_cursor(connection) + res = cur.execute(f"SELECT * FROM mathematicians WHERE id = ?", (id,),) res = cur.fetchall() if connection == None: conn.close() - return res + if len(res) == 0: + return((0,0,0,0,0,0,0,0,0,0)) + else: + return(res[0]) + # def _makeLimitIterable(self, limit): # if type(limit) == int: @@ -315,6 +410,65 @@ class mathDB: # print("Wrong parameter. Limit must be an iterable object or an integer") # return(ret) + def get_ancestors(self, mathID, depth=0, connection=None): + cur,conn = self.get_cursor(connection) + i = 0 + anc = {mathID} + anc_new = {mathID} + if not self.exists(mathID): + self.add_person(mathID) + while True: + anc_temp = set() + if i > depth and not depth == 0: + break + for a in anc_new: + p = self.get_person(a, connection=conn) + for j in range(3): + if p[6+j] > 0: + anc_temp.add(p[6+j]) + anc = anc | anc_new + anc_new = anc_temp + if len(anc_new) == 0: + break + i += 1 + if connection == None: + conn.close() + return(anc) + + + + def add_ancestors(self, mathID, depth=0, connection=None): + cur,conn = self.get_cursor(connection) + i = 0 + anc = {mathID} + anc_new = {mathID} + if not self.exists(mathID): + self.add_person(mathID) + while True: + anc_temp = set() + if i > depth and not depth == 0: + break + for a in anc_new: + p = self.get_person(a, connection=connection) + for j in range(3): + if p[6+j] > 0: + anc_temp.add(p[6+j]) +# print(p[6+j]) + anc = anc | anc_new + anc_new = anc_temp + if len(anc_new) == 0: + break + for a in anc_new: + if not self.exists(a): + self.add_person(a, connection=connection) + i += 1 + if connection == None: + conn.close() + + def add_decendants(self, mathID, depth=0, connection=None): + pass + + def populate_db(self, limit, chunk = 10): conn = sqlite3.connect(self.db_file) @@ -335,14 +489,11 @@ class mathDB: thread = threading.Thread(target = persons[j].get_info) threads.append(thread) thread.start() - for j, thread in enumerate(threads): thread.join() - for j in range(chunk): print(f"Adding MathID {limit[i*chunk+j]} to database. Entry {i*chunk+j+1} of {l}.", end= '\r') self.insert_person(persons[j], conn) - threads = list() persons = list() for j in range(r): @@ -351,18 +502,14 @@ class mathDB: thread = threading.Thread(target = persons[j].get_info) threads.append(thread) thread.start() - for j, thread in enumerate(threads): - thread.join() - for j in range(r): print(f"Adding MathID {limit[n*chunk+j]} to database. Entry {n*chunk+j+1} of {l}.", end= '\r') self.insert_person(persons[j], conn) # print(f"Downloading entry {i} of {limit}", end= '\r') # page = mathPage(i) # self.insert_person(page) - conn.close() @@ -373,17 +520,55 @@ class mathGenealogy(Graph): def __init__(self, DB="MathGen.db", vertices = None, directed=True): self.db = mathDB(DB) super().__init__(directed=directed) - #self.vs["name"] = "" + self.vs["name"] = "" - def add_person(self, vertex): - if "name" in vertex.keys(): - self.add_vertex(vertex["name"]) - else: - print("Mandatory entry 'name' is missing") - return() - for key in vertex: - val = vertex[key] - print(f"Dictionary contains {key} with value {val}") + def add_person(self, mathID, root=0, line=0, force=False): + person = self.db.get_person(mathID) + if person[0] == 0: + p = mathPage(mathID) + p.get_info() + entry = p.get_entry() + if entry[1] == None: + raise ValueError("No such entry in the math genealogy project found.") + return + else: + self.db.insert_person(p) + person = self.db.get_person(mathID) + if str(mathID) not in self.vs['name']: + vID = self.add_vertex(str(mathID)).index + self._insert_person(person, vID, root, line) + + elif force: + vID = graph.vs.find(name=str(mathID)).index + self._insert_person(person, vID, root, line) + + + def _insert_person(self, person, vID, root, line): + students = json.loads(person[9]) + self.vs[vID]["Name"] = person[1] + self.vs[vID]["Title"] = person[2] + self.vs[vID]["Year"] = str(person[3]) + self.vs[vID]["Dissertation"] = person[4] + self.vs[vID]["Institution"] = person[5] + self.vs[vID]["Students"] = students["MathID"] + self.vs[vID]["Advisors"] = person[6:8] + self.vs[vID]["Line"] = line + self.vs[vID]["roots"] = [root] + + + + + + + + # if "name" in vertex.keys(): + # self.add_vertex(vertex["name"]) + # else: + # print("Mandatory entry 'name' is missing") + # return() + # for key in vertex: + # val = vertex[key] + # print(f"Dictionary contains {key} with value {val}") @@ -430,6 +615,17 @@ class mathGenealogy(Graph): total_entries = 297377 # number of records as of 2 October 2023 +Greven_ID = 29360 # Andreas Greven +Aumann_ID= 36548 # Georg Aumann +Anita_ID = 92324 # Anita +Fourier_ID = 17981 # Fourier +Wolfgang_ID = 150286 # Wolfgang +Eichelsbacher_ID = 27275 # Peter +Anton_ID = 125956 # Anton +Pfaffelhuber_ID = 157881 # Peter Pfaffelhuber +Ruess_ID = 75966 # Ruess + + if __name__ == "__main__": testDB = mathDB("test.db") test = mathPage(0)