librus_apix.schedule
This module provides functions for retrieving schedule information from the Librus site, parsing it, and formatting it into a structured representation.
Classes: - Event: Represents an event in the schedule with various attributes like title, subject, day, etc.
Functions: - schedule_detail: Fetches detailed schedule information for a specific prefix and detail URL suffix. - get_schedule: Fetches the schedule for a specific month and year.
Usage:
from librus_apix.client import new_client
# Create a new client instance
client = new_client()
client.get_token(username, password)
# Fetch the schedule for a specific month and year
month = "01"
year = "2024"
include_empty = True
monthly_schedule = get_schedule(client, month, year, include_empty)
# Fetch detailed schedule information
day_one = monthly_schedule[1].href
prefix, suffix = day_one.split("/")
detailed_schedule = schedule_detail(client, prefix, suffix)
1""" 2This module provides functions for retrieving schedule information from the Librus site, parsing it, and formatting it into a structured representation. 3 4Classes: 5 - Event: Represents an event in the schedule with various attributes like title, subject, day, etc. 6 7Functions: 8 - schedule_detail: Fetches detailed schedule information for a specific prefix and detail URL suffix. 9 - get_schedule: Fetches the schedule for a specific month and year. 10 11Usage: 12 ```python 13 from librus_apix.client import new_client 14 15 # Create a new client instance 16 client = new_client() 17 client.get_token(username, password) 18 19 20 # Fetch the schedule for a specific month and year 21 month = "01" 22 year = "2024" 23 include_empty = True 24 monthly_schedule = get_schedule(client, month, year, include_empty) 25 26 # Fetch detailed schedule information 27 day_one = monthly_schedule[1].href 28 prefix, suffix = day_one.split("/") 29 detailed_schedule = schedule_detail(client, prefix, detail_url) 30 ``` 31""" 32 33import re 34from collections import defaultdict 35from dataclasses import dataclass 36from typing import DefaultDict, Dict, List, Union 37 38from bs4 import BeautifulSoup, NavigableString, Tag 39 40from librus_apix.client import Client 41from librus_apix.exceptions import ParseError 42from librus_apix.helpers import no_access_check 43 44 45@dataclass 46class Event: 47 """ 48 Represents an event in the schedule. 49 50 Attributes: 51 title (str): The title of the event. 52 subject (str): The subject of the event. 53 data (dict): Additional data associated with the event. 54 day (str): The day on which the event occurs. 55 number (Union[int, str]): The number associated with the event. 56 hour (str): The hour at which the event occurs. 57 href (str): 'prefix'/'suffix' joined with a slash (this should be reworked...). 
58 """ 59 60 title: str 61 subject: str 62 data: dict 63 day: str 64 number: Union[int, str] 65 hour: str 66 href: str 67 68 69def schedule_detail(client: Client, prefix: str, detail_url: str) -> Dict[str, str]: 70 """ 71 Fetches the detailed schedule information for a specific prefix and detail URL suffix. 72 73 Args: 74 client (Client): The client object for making HTTP requests. 75 prefix (str): The prefix of the schedule URL. 76 detail_url (str): The detail URL of the schedule. 77 78 Returns: 79 Dict[str, str]: A dictionary containing schedule details. 80 """ 81 schedule = {} 82 div = no_access_check( 83 BeautifulSoup( 84 client.get(client.SCHEDULE_URL + prefix + "/" + detail_url).text, "lxml" 85 ) 86 ).find("div", attrs={"class": "container-background"}) 87 88 if div is None or isinstance(div, NavigableString): 89 raise ParseError("Error in parsing schedule details.") 90 tr: List[Tag] = div.find_all("tr", attrs={"class": ["line0", "line1"]}) 91 for s in tr: 92 th = s.find("th") 93 td = s.find("td") 94 if td is None or th is None: 95 continue 96 schedule[th.text.strip()] = td.text.strip() 97 return schedule 98 99 100def _parse_title_into_pairs(title: str) -> Dict[str, str]: 101 additional_data = {} 102 pairs = [pair.split(":", 1) for pair in title.split("<br />")] 103 for pair in pairs: 104 if len(pair) != 2: 105 additional_data[pair[0].strip()] = "unknown" 106 continue 107 key, val = pair 108 additional_data[key.strip()] = val.strip() 109 110 return additional_data 111 112 113def get_schedule( 114 client: Client, month: str, year: str, include_empty: bool = False 115) -> DefaultDict[int, List[Event]]: 116 """ 117 Fetches the schedule for a specific month and year. 118 119 Args: 120 client (Client): The client object for making HTTP requests. 121 month (str): The month for which the schedule is requested. 122 year (str): The year for which the schedule is requested. 123 include_empty (bool, optional): Flag to include empty schedules. Defaults to False. 
124 125 Returns: 126 DefaultDict[int, List[Event]]: A dictionary containing the schedule for each day of the month. 127 """ 128 schedule = defaultdict(list) 129 soup = no_access_check( 130 BeautifulSoup( 131 client.post(client.SCHEDULE_URL, data={"rok": year, "miesiac": month}).text, 132 "lxml", 133 ) 134 ) 135 days = soup.find_all("div", attrs={"class": "kalendarz-dzien"}) 136 if len(days) < 1: 137 raise ParseError("Error in parsing days of the schedule.") 138 for day in days: 139 try: 140 d = int(day.find("div", attrs={"class": "kalendarz-numer-dnia"}).text) 141 except: 142 raise ParseError("Error while parsing day number") 143 if include_empty == True: 144 schedule[d] = [] 145 tr: List[Tag] = day.find_all("tr") 146 for event in tr: 147 td = event.find("td") 148 if td is None or isinstance(td, NavigableString): 149 continue 150 title = td.attrs.get("title", "Nauczyciel: unknown<br />Opis: unknown") 151 additional_data = _parse_title_into_pairs(title) 152 subject = "unspecified" 153 span = td.find("span") 154 if span is not None: 155 subject = span.text 156 span.extract() 157 158 delimeter = "###" 159 for line in td.select("br"): 160 line.replaceWith(delimeter) 161 data = ( 162 td.text.replace("\xa0", " ") 163 .replace(", ", "") 164 .replace("\n", "") 165 .strip() 166 .split(delimeter) 167 ) 168 if subject == "unspecified": 169 subject = data[0] 170 if len(data) >= 2: 171 title = data[1] 172 else: 173 title = data[0] 174 175 number = "unknown" 176 hour = "unknown" 177 number_td = event.find("td") 178 if number_td is None or isinstance(number_td, NavigableString): 179 raise ParseError("Error while parsing td_number schedule.") 180 try: 181 number = int( 182 re.findall(r": ?[0-99]?[0-99]", number_td.text)[0].replace(": ", "") 183 ) 184 except ValueError: 185 hour = re.findall(r" ?[0-2]?[0-9]:?[0-5]?[0-9]", number_td.text)[0] 186 except IndexError: 187 pass 188 onclick = number_td.attrs.get("onclick", "'") 189 href = onclick.split("'")[1].split("/") 190 if len(href) 
>= 2: 191 href = "/".join(href[2:]) 192 else: 193 href = "" 194 195 event = Event(title, subject, additional_data, str(d), number, hour, href) 196 schedule[d].append(event) 197 return schedule 198 199 200@dataclass 201class RecentEvent: 202 """ 203 The events inside recent_schedule differ a little bit 204 the .data should contain event name, date from to and duration 205 I might be able to extract into separate values if I get html 206 """ 207 208 date_added: str 209 type: str 210 data: str 211 212 213def _sanitize_data(data: str) -> str: 214 return ( 215 data.replace(" ", " ") 216 .replace("<br/>", "<br>") 217 .replace("<br>", "\n") 218 .strip() 219 ) 220 221 222def get_recently_added_schedule(client: Client) -> List[RecentEvent]: 223 """ 224 Events can be viewed only once here, any subsequent call won't have same events 225 Made blindly based on a screenshot, still untested... 226 """ 227 events = [] 228 soup = no_access_check( 229 BeautifulSoup( 230 client.get(client.RECENT_SCHEDULE_URL).text, 231 "lxml", 232 ) 233 ) 234 bg = soup.select_one("div.container-background") 235 if bg is None: 236 raise ParseError("Unable to locate recent schedule container-background") 237 table = soup.select_one("table") 238 if table is None: 239 return [] 240 rows = table.select("tr") 241 for row in rows: 242 tds = row.select("td") 243 if len(tds) != 4: 244 continue 245 _, date_added, _type, data = tds 246 data = _sanitize_data(data.text) 247 # unsure about that so we'll check 248 if "czas dodania" in date_added and "rodzaj zdarzenia" in _type: 249 continue 250 event = RecentEvent(date_added.text.strip(), _type.text.strip(), data) 251 events.append(event) 252 return events
@dataclass
class Event:
    """
    Represents a single event in the monthly schedule.

    Attributes:
        title (str): The title of the event.
        subject (str): The subject of the event.
        data (dict): Additional key/value data parsed from the event tooltip.
        day (str): The day of the month on which the event occurs.
        number (Union[int, str]): The lesson number of the event, or "unknown".
        hour (str): The hour at which the event occurs, or "unknown".
        href (str): 'prefix'/'suffix' joined with a slash (this should be reworked...).
    """

    title: str
    subject: str
    data: dict
    day: str
    number: Union[int, str]
    hour: str
    href: str
Represents an event in the schedule.
Attributes: title (str): The title of the event. subject (str): The subject of the event. data (dict): Additional data associated with the event. day (str): The day on which the event occurs. number (Union[int, str]): The number associated with the event. hour (str): The hour at which the event occurs. href (str): 'prefix'/'suffix' joined with a slash (this should be reworked...).
def schedule_detail(client: Client, prefix: str, detail_url: str) -> Dict[str, str]:
    """
    Fetch the detailed schedule page for a prefix and detail URL suffix.

    Args:
        client (Client): The client object for making HTTP requests.
        prefix (str): The prefix of the schedule URL.
        detail_url (str): The detail URL of the schedule.

    Returns:
        Dict[str, str]: Mapping of detail labels to their values.

    Raises:
        ParseError: If the details container cannot be located in the page.
    """
    page = no_access_check(
        BeautifulSoup(
            client.get(client.SCHEDULE_URL + prefix + "/" + detail_url).text, "lxml"
        )
    )
    container = page.find("div", attrs={"class": "container-background"})
    if container is None or isinstance(container, NavigableString):
        raise ParseError("Error in parsing schedule details.")

    details: Dict[str, str] = {}
    rows: List[Tag] = container.find_all("tr", attrs={"class": ["line0", "line1"]})
    for row in rows:
        label = row.find("th")
        value = row.find("td")
        # Rows missing either cell are skipped.
        if value is None or label is None:
            continue
        details[label.text.strip()] = value.text.strip()
    return details
Fetches the detailed schedule information for a specific prefix and detail URL suffix.
Args: client (Client): The client object for making HTTP requests. prefix (str): The prefix of the schedule URL. detail_url (str): The detail URL of the schedule.
Returns: Dict[str, str]: A dictionary containing schedule details.
def get_schedule(
    client: Client, month: str, year: str, include_empty: bool = False
) -> DefaultDict[int, List[Event]]:
    """
    Fetches the schedule for a specific month and year.

    Args:
        client (Client): The client object for making HTTP requests.
        month (str): The month for which the schedule is requested.
        year (str): The year for which the schedule is requested.
        include_empty (bool, optional): Flag to include empty schedules. Defaults to False.

    Returns:
        DefaultDict[int, List[Event]]: A dictionary containing the schedule for each day of the month.

    Raises:
        ParseError: If the calendar days or a day number cannot be parsed.
    """
    schedule: DefaultDict[int, List[Event]] = defaultdict(list)
    soup = no_access_check(
        BeautifulSoup(
            client.post(client.SCHEDULE_URL, data={"rok": year, "miesiac": month}).text,
            "lxml",
        )
    )
    days = soup.find_all("div", attrs={"class": "kalendarz-dzien"})
    if len(days) < 1:
        raise ParseError("Error in parsing days of the schedule.")
    for day in days:
        try:
            # find() returning None raises AttributeError; non-numeric text
            # raises ValueError — both indicate a parse failure.
            d = int(day.find("div", attrs={"class": "kalendarz-numer-dnia"}).text)
        except (AttributeError, ValueError) as exc:
            raise ParseError("Error while parsing day number") from exc
        if include_empty:
            schedule[d] = []
        rows: List[Tag] = day.find_all("tr")
        for row in rows:
            td = row.find("td")
            if td is None or isinstance(td, NavigableString):
                continue
            title = td.attrs.get("title", "Nauczyciel: unknown<br />Opis: unknown")
            additional_data = _parse_title_into_pairs(title)
            subject = "unspecified"
            span = td.find("span")
            if span is not None:
                subject = span.text
                span.extract()

            # Replace <br> tags with a sentinel so the cell text can be split
            # into its visual lines afterwards.
            delimiter = "###"
            for br in td.select("br"):
                br.replace_with(delimiter)
            data = (
                td.text.replace("\xa0", " ")
                .replace(", ", "")
                .replace("\n", "")
                .strip()
                .split(delimiter)
            )
            if subject == "unspecified":
                subject = data[0]
            # NOTE(review): title is overwritten from the cell text here for
            # every row (the tooltip title only feeds additional_data) —
            # confirm this matches the intended nesting of the original.
            if len(data) >= 2:
                title = data[1]
            else:
                title = data[0]

            number: Union[int, str] = "unknown"
            hour = "unknown"
            # The first <td> (already mutated above) also carries the lesson
            # number / hour text and the onclick with the detail URL.
            number_td = row.find("td")
            if number_td is None or isinstance(number_td, NavigableString):
                raise ParseError("Error while parsing td_number schedule.")
            try:
                number = int(
                    re.findall(r": ?[0-9]?[0-9]", number_td.text)[0].replace(": ", "")
                )
            except ValueError:
                # No lesson number; fall back to an HH:MM-style hour if present.
                hours = re.findall(r" ?[0-2]?[0-9]:?[0-5]?[0-9]", number_td.text)
                if hours:
                    hour = hours[0]
            except IndexError:
                pass
            onclick = number_td.attrs.get("onclick", "'")
            href_parts = onclick.split("'")[1].split("/")
            if len(href_parts) >= 2:
                href = "/".join(href_parts[2:])
            else:
                href = ""

            schedule[d].append(
                Event(title, subject, additional_data, str(d), number, hour, href)
            )
    return schedule
Fetches the schedule for a specific month and year.
Args: client (Client): The client object for making HTTP requests. month (str): The month for which the schedule is requested. year (str): The year for which the schedule is requested. include_empty (bool, optional): Flag to include empty schedules. Defaults to False.
Returns: DefaultDict[int, List[Event]]: A dictionary containing the schedule for each day of the month.
@dataclass
class RecentEvent:
    """
    An event from the recently-added schedule view.

    These differ a little bit from Event: .data should contain the event
    name, date from/to and duration. It might be possible to extract these
    into separate values given the HTML.
    """

    date_added: str
    type: str
    data: str
The events inside recent_schedule differ a little bit the .data should contain event name, date from to and duration I might be able to extract into separate values if I get html
def get_recently_added_schedule(client: Client) -> List[RecentEvent]:
    """
    Fetches events recently added to the schedule.

    Events can be viewed only once here; any subsequent call won't have the
    same events. Made blindly based on a screenshot, still untested...

    Args:
        client (Client): The client object for making HTTP requests.

    Returns:
        List[RecentEvent]: The parsed recent events (empty when no table exists).

    Raises:
        ParseError: If the container div cannot be located.
    """
    events: List[RecentEvent] = []
    soup = no_access_check(
        BeautifulSoup(
            client.get(client.RECENT_SCHEDULE_URL).text,
            "lxml",
        )
    )
    bg = soup.select_one("div.container-background")
    if bg is None:
        raise ParseError("Unable to locate recent schedule container-background")
    table = soup.select_one("table")
    if table is None:
        return []
    rows = table.select("tr")
    for row in rows:
        tds = row.select("td")
        if len(tds) != 4:
            continue
        _, date_added, _type, data = tds
        sanitized = _sanitize_data(data.text)
        # Skip the header row. BUG FIX: `"..." in tag` tests the tag's child
        # nodes, not its text — compare against the cell text instead.
        if "czas dodania" in date_added.text and "rodzaj zdarzenia" in _type.text:
            continue
        event = RecentEvent(date_added.text.strip(), _type.text.strip(), sanitized)
        events.append(event)
    return events
Events can be viewed only once here, any subsequent call won't have same events Made blindly based on a screenshot, still untested...