
Include classes that lack schedules + split off IAP/summer #100

Merged (12 commits, Dec 15, 2024)
.gitignore: 3 additions & 1 deletion

@@ -26,8 +26,10 @@ dist-ssr

# artifacts
scrapers/catalog.json
-scrapers/fireroad.json
+scrapers/fireroad-sem.json
+scrapers/fireroad-presem.json
public/latest.json
+public/i25.json

Collaborator commented:
Is there no other way of making sure we don't commit the pre-semester file? I feel like this will start getting bloated after a few semesters.

Member Author replied:
The main other way that I can think of is to put the latest pre-semester in public/latestPreSemester.json and then to add a special case to src/lib/TermSwitcher.tsx so that the urlName corresponding to the latest pre-semester will read from the file public/latestPreSemester.json.

This would break the correspondence between urlNames and JSON file names that we have now; e.g., https://hydrant.mit.edu/?t=latest loads Spring 2025 while https://hydrant.mit.edu/?t=s25 does not, since the file is latest.json and not yet s25.json.

Alternatively, we could make public/i25.json a symlink to public/latestPreSemester.json, but then the maintenance task on each semester is changed from editing .gitignore to modifying a symlink (which is IMO marginally worse).

Bloat shouldn't become an issue here, since it's only the latest pre-semester that needs to be ignored; when the fall 2025 listings are out, public/i25.json should be committed (along with public/s25.json) and it will be public/m25.json that will need to be ignored.
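
For reference, a hypothetical helper (not part of this PR or the repo) that spells out the urlName convention this discussion relies on, where `i`, `s`, `m`, and `f` stand for IAP, spring, summer, and fall:

```python
# Hypothetical, for illustration only: maps a semester urlName to the urlName of
# the pre-semester term that precedes it (IAP before spring, summer before fall).
def pre_semester_url_name(semester_url_name: str) -> str:
    season, year = semester_url_name[0], semester_url_name[1:]
    return {"s": "i", "f": "m"}[season] + year

assert pre_semester_url_name("s25") == "i25"  # IAP 2025 precedes Spring 2025
assert pre_semester_url_name("f25") == "m25"  # Summer 2025 precedes Fall 2025
```

Under this convention only one pre-semester file is pending at a time, which is why a single ignored path suffices.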


# python
__pycache__
README.md: 3 additions & 1 deletion

@@ -38,7 +38,9 @@ Let's say you're updating from e.g. Spring 2023 to Fall 2023.

First, archive the old semester. Make sure you have updated schedule files. Then run `mv public/latest.json public/s23.json`.

-Then, update the new semester. Open `public/latestTerm.json`, change `urlName` to `f23`, and update the dates per [Registrar](https://registrar.mit.edu/calendar).
+Then, update the new semester. Open `public/latestTerm.json`, change `urlName` to `m23` (for the "pre-semester" summer 2023) and `f23` (for the semester fall 2023), and update the dates per [Registrar](https://registrar.mit.edu/calendar).
+
+Next, update the `.gitignore` to ignore `public/m23.json` rather than `public/i23.json`.

Finally, run the normal update process and commit the results to the repo.

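The manual steps above could in principle be scripted. A rough sketch, assuming the file layout introduced in this PR (this script is hypothetical and not part of the repo):

```python
# Hypothetical convenience script sketching the Spring 2023 -> Fall 2023 archive
# steps described in the README above; dates in latestTerm.json still need to be
# updated by hand from the Registrar's calendar.
import json
from pathlib import Path

def archive_semester(old_sem="s23", old_presem="i23", new_presem="m23", new_sem="f23"):
    public = Path("public")

    # 1. Archive the old semester's class data under its own urlName.
    (public / "latest.json").rename(public / f"{old_sem}.json")

    # 2. Point latestTerm.json at the new terms.
    term_path = public / "latestTerm.json"
    term_info = json.loads(term_path.read_text())
    term_info["preSemester"]["urlName"] = new_presem
    term_info["semester"]["urlName"] = new_sem
    term_path.write_text(json.dumps(term_info, indent=2) + "\n")

    # 3. Ignore the new pre-semester artifact instead of the old one.
    gitignore = Path(".gitignore")
    gitignore.write_text(
        gitignore.read_text().replace(
            f"public/{old_presem}.json", f"public/{new_presem}.json"
        )
    )
```
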
public/latestTerm.json: 25 additions & 15 deletions

@@ -1,17 +1,27 @@
{
-  "urlName": "s25",
-  "startDate": "2025-02-03",
-  "h1EndDate": "2025-03-21",
-  "h2StartDate": "2025-03-31",
-  "endDate": "2025-05-13",
-  "mondayScheduleDate": "2025-02-18",
-  "holidayDates": [
-    "2025-02-17",
-    "2025-03-24",
-    "2025-03-25",
-    "2025-03-26",
-    "2025-03-27",
-    "2025-03-28",
-    "2025-04-21"
-  ]
+  "preSemester": {
+    "urlName": "i25",
+    "startDate": "2025-01-06",
+    "endDate": "2025-01-31",
+    "holidayDates": [
+      "2025-01-20"
+    ]
+  },
+  "semester": {
+    "urlName": "s25",
+    "startDate": "2025-02-03",
+    "h1EndDate": "2025-03-21",
+    "h2StartDate": "2025-03-31",
+    "endDate": "2025-05-13",
+    "mondayScheduleDate": "2025-02-18",
+    "holidayDates": [
+      "2025-02-17",
+      "2025-03-24",
+      "2025-03-25",
+      "2025-03-26",
+      "2025-03-27",
+      "2025-03-28",
+      "2025-04-21"
+    ]
+  }
}
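
The scrapers read this file through `utils.get_term_info(is_semester_term)` (see fireroad.py and package.py below). `utils.py` is not part of this diff, but judging from its call sites, the accessor plausibly looks something like this sketch (an assumption, not repo code):

```python
# Plausible shape of utils.get_term_info, inferred from how it is called below;
# the real implementation may differ. Assumes the scrapers run from scrapers/.
import json

def get_term_info(is_semester_term):
    """Return the "semester" or "preSemester" block of latestTerm.json."""
    with open("../public/latestTerm.json", encoding="utf-8") as f:
        latest_term = json.load(f)
    return latest_term["semester" if is_semester_term else "preSemester"]
```
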
scrapers/catalog.py: 16 additions & 2 deletions

@@ -23,6 +23,19 @@
BASE_URL = "http://student.mit.edu/catalog"


def is_not_offered_this_year(html):
"""
Args:
* html (BeautifulSoup): the input webpage

Returns:
* bool: True if the class is not offered this year
"""
if html.find(attrs={"src": "/icns/nooffer.gif"}):
return True
return False


def is_not_offered_next_year(html):
"""
Args:
@@ -228,8 +241,9 @@ def scrape_courses_from_page(courses, href):
filtered_html = BeautifulSoup()
filtered_html.extend(content)
course_data = get_course_data(filtered_html)
for course_num in course_nums:
courses[course_num] = course_data
if not is_not_offered_this_year(filtered_html):
for course_num in course_nums:
courses[course_num] = course_data


def run():
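
To illustrate what the new check keys on: the catalog pages mark classes not offered this year with the `/icns/nooffer.gif` icon, and any page containing that image is now skipped. The snippets below are made up for illustration; they assume this is run from `scrapers/` so that `catalog.py` is importable.

```python
# Illustrative only: exercising is_not_offered_this_year on made-up HTML.
from bs4 import BeautifulSoup
from catalog import is_not_offered_this_year

offered = BeautifulSoup("<td>Some class offered this year</td>", "html.parser")
not_offered = BeautifulSoup('<td><img src="/icns/nooffer.gif">Not offered this year</td>', "html.parser")

assert is_not_offered_this_year(offered) is False
assert is_not_offered_this_year(not_offered) is True
```
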
scrapers/fireroad.py: 78 additions & 40 deletions

@@ -22,6 +22,7 @@
import json
import requests
import utils
from utils import Term

URL = "https://fireroad.mit.edu/courses/all?full=true"

@@ -88,20 +89,19 @@ def parse_section(section):
return [slots, place]


def parse_schedule(course):
def parse_schedule(schedule):
"""
Parses the schedule string, which looks like:
"Lecture,32-123/TR/0/11/F/0/2;Recitation,2-147/MW/0/10,2-142/MW/0/11"

Args:
* course (dict[str, Union[bool, float, int, list[str], str]]): The course object.
* schedule (str): The schedule string.

Returns:
* dict[str, union[list, bool]: The parsed schedule

Raises AssertionError or KeyError if parse_section does.
"""
schedule = course["schedule"]
section_tba = False
result = {}
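
To make the docstring's example concrete, here is how that raw string decomposes; the meaning of the fields after the room is inferred from parse_section and may not be exact:

```python
# Illustrative decomposition of the example schedule string from the docstring above.
raw = "Lecture,32-123/TR/0/11/F/0/2;Recitation,2-147/MW/0/10,2-142/MW/0/11"

for group in raw.split(";"):              # one group per section kind
    kind, *sections = group.split(",")    # "Lecture"/"Recitation", then one entry per section
    for section in sections:
        place, *slot_fields = section.split("/")  # room first, then day/time fields
        print(kind, place, slot_fields)

# Lecture 32-123 ['TR', '0', '11', 'F', '0', '2']
# Recitation 2-147 ['MW', '0', '10']
# Recitation 2-142 ['MW', '0', '11']
```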

@@ -205,18 +205,19 @@ def parse_prereqs(course):
return {"prereqs": prereqs}


def get_course_data(courses, course):
def get_course_data(courses, course, term):
"""
Parses a course from the Fireroad API, and puts it in courses. Skips the
courses Fireroad doesn't have schedule info for. Returns False if skipped,
courses that are not offered in the current term. Returns False if skipped,
True otherwise. The `courses` variable is modified in place.

Args:
* courses (list[dict[str, Union[bool, float, int, list[str], str]]]): The list of courses.
* course (dict[str, Union[bool, float, int, list[str], str]]): The course in particular.
* term (Term): The current term (fall, IAP, or spring).

Returns:
* bool: Whether Fireroad has schedule information for this course.
* bool: Whether the course was entered into courses.
"""
course_code = course["subject_id"]
course_num, course_class = course_code.split(".")
@@ -226,41 +227,72 @@ def get_course_data(courses, course):
"subject": course_class,
}

if "schedule" not in course:
# TODO: Do something else with this?
return False
# terms, prereqs
raw_class.update(parse_terms(course))
raw_class.update(parse_prereqs(course))

# tb, s, l, r, b, lr, rr, br
try:
raw_class.update(parse_schedule(course))
except Exception as e:
# if we can't parse the schedule, warn
print(f"Can't parse schedule {course_code}: {e!r}")
if term.name not in raw_class["terms"]:
return False

# hh, ha, hs, he, ci, cw, re, la, pl
has_schedule = "schedule" in course

# tba, sectionKinds, lectureSections, recitationSections, labSections,
# designSections, lectureRawSections, recitationRawSections, labRawSections,
# designRawSections
if has_schedule:
try:
if term == Term.FA and "scheduleFall" in course:
raw_class.update(parse_schedule(course["scheduleFall"]))
elif term == Term.JA and "scheduleIAP" in course:
raw_class.update(parse_schedule(course["scheduleIAP"]))
elif term == Term.SP and "scheduleSpring" in course:
raw_class.update(parse_schedule(course["scheduleSpring"]))
else:
raw_class.update(parse_schedule(course["schedule"]))
except Exception as e:
# if we can't parse the schedule, warn
print(f"Can't parse schedule {course_code}: {e!r}")
has_schedule = False
if not has_schedule:
raw_class.update(
{
"tba": False,
"sectionKinds": [],
"lectureSections": [],
"recitationSections": [],
"labSections": [],
"designSections": [],
"lectureRawSections": [],
"recitationRawSections": [],
"labRawSections": [],
"designRawSections": [],
}
)

# hassH, hassA, hassS, hassE, cih, cihw, rest, lab, partLab
raw_class.update(parse_attributes(course))
raw_class.update(
{
"lectureUnits": course["lecture_units"],
"labUnits": course["lab_units"],
"preparationUnits": course["preparation_units"],
"level": course["level"],
"isVariableUnits": course["is_variable_units"],
"same": ", ".join(course.get("joint_subjects", [])),
"meets": ", ".join(course.get("meets_with_subjects", [])),
}
)
# This should be the case with variable-units classes, but just to make sure.
try:
raw_class.update(
{
"lectureUnits": course["lecture_units"],
"labUnits": course["lab_units"],
"preparationUnits": course["preparation_units"],
"level": course["level"],
"isVariableUnits": course["is_variable_units"],
"same": ", ".join(course.get("joint_subjects", [])),
"meets": ", ".join(course.get("meets_with_subjects", [])),
}
)
except KeyError as e:
print(f"Can't parse {course_code}: {e!r}")
return False
# This should be the case with variable-units classes, but just to make
# sure.
if raw_class["isVariableUnits"]:
assert raw_class["lectureUnits"] == 0
assert raw_class["labUnits"] == 0
assert raw_class["preparationUnits"] == 0

# t, pr
raw_class.update(parse_terms(course))
raw_class.update(parse_prereqs(course))

raw_class.update(
{
"description": course.get("description", ""),
Expand All @@ -271,7 +303,7 @@ def get_course_data(courses, course):
}
)

# nx, rp, u, f, hf, lm are from catalog.json, not here
# nonext, repeat, url, final, half, limited are from catalog.json, not here

if "old_id" in course:
raw_class["oldNumber"] = course["old_id"]
@@ -289,27 +321,33 @@ def get_course_data(courses, course):
return True


def run():
def run(is_semester_term):
"""
The main entry point. All data is written to `fireroad.json`.

There are no arguments and there is no return value.
Args:
* is_semester_term (bool): whether to look at the semester term (fall/spring) or the pre-semester term (summer/IAP).

Returns: none
"""
text = requests.get(URL).text
data = json.loads(text)
courses = dict()
term = utils.url_name_to_term(utils.get_term_info(is_semester_term)["urlName"])
fname = "fireroad-sem.json" if is_semester_term else "fireroad-presem.json"
missing = 0

for course in data:
has_schedule = get_course_data(courses, course)
if not has_schedule:
included = get_course_data(courses, course, term)
if not included:
missing += 1

with open("fireroad.json", "w") as f:
with open(fname, "w") as f:
json.dump(courses, f)
print(f"Got {len (courses)} courses")
print(f"Skipped {missing} courses due to missing schedules")
print(f"Skipped {missing} courses that are not offered in the {term.value} term")


if __name__ == "__main__":
run()
run(False)
run(True)
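
`utils.py` is not shown in this diff. Based on how `Term`, `term.name`, `term.value`, and `url_name_to_term` are used above, it plausibly contains something along these lines (an assumption, not the actual implementation):

```python
# Assumed sketch of the utils pieces fireroad.py relies on: member names match the
# "FA"/"JA"/"SP" codes compared against raw_class["terms"], and the values are the
# human-readable names printed by run().
from enum import Enum

class Term(Enum):
    FA = "fall"
    JA = "IAP"
    SP = "spring"
    SU = "summer"

def url_name_to_term(url_name):
    """Map a urlName such as "f25", "i25", "s25", or "m25" to a Term."""
    return {"f": Term.FA, "i": Term.JA, "s": Term.SP, "m": Term.SU}[url_name[0]]
```
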
scrapers/package.py: 29 additions & 12 deletions

@@ -58,29 +58,46 @@ def run():
Takes data from fireroad.json and catalog.json; outputs latest.json.
There are no arguments and no return value.
"""
fireroad = load_json_data("fireroad.json")
fireroad_presem = load_json_data("fireroad-presem.json")
fireroad_sem = load_json_data("fireroad-sem.json")
catalog = load_json_data("catalog.json")
overrides = load_json_data("overrides.json")

# The key needs to be in BOTH fireroad and catalog to make it:
# If it's not in Fireroad, we don't have its schedule.
# If it's not in catalog, it's not offered this semester.
courses = merge_data(
datasets=[fireroad, catalog, overrides],
keys_to_keep=set(fireroad) & set(catalog),
# If it's not in Fireroad, it's not offered in this semester (fall, etc.).
# If it's not in catalog, it's not offered this year.
courses_presem = merge_data(
datasets=[fireroad_presem, catalog, overrides],
keys_to_keep=set(fireroad_presem) & set(catalog),
)
courses_sem = merge_data(
datasets=[fireroad_sem, catalog, overrides],
keys_to_keep=set(fireroad_sem) & set(catalog),
)

term_info = utils.get_term_info()
term_info_presem = utils.get_term_info(False)
url_name_presem = term_info_presem["urlName"]
term_info_sem = utils.get_term_info(True)
url_name_sem = term_info_sem["urlName"]
now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")
obj = {
"termInfo": term_info,

obj_presem = {
"termInfo": term_info_presem,
"lastUpdated": now,
"classes": courses_presem,
}
obj_sem = {
"termInfo": term_info_sem,
"lastUpdated": now,
"classes": courses,
"classes": courses_sem,
}

with open(f"../public/{url_name_presem}.json", mode="w", encoding="utf-8") as f:
json.dump(obj_presem, f, separators=(",", ":"))
with open("../public/latest.json", mode="w", encoding="utf-8") as f:
json.dump(obj, f, separators=(",", ":"))
print(f"Got {len(courses)} courses")
json.dump(obj_sem, f, separators=(",", ":"))
print(f"{url_name_presem}: got {len(courses_presem)} courses")
print(f"{url_name_sem}: got {len(courses_sem)} courses")


if __name__ == "__main__":
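
`merge_data` itself is unchanged by this PR and not shown here. The comment in `run()` above implies intersection-then-override semantics, roughly like the following sketch (an assumption about the existing helper, not its actual code):

```python
# Rough sketch of merge_data's implied behavior: keep only keys_to_keep, and let
# later datasets (catalog, then overrides) override earlier ones field by field.
def merge_data(datasets, keys_to_keep):
    merged = {key: {} for key in keys_to_keep}
    for dataset in datasets:
        for key, fields in dataset.items():
            if key in keys_to_keep:
                merged[key].update(fields)
    return merged
```
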
scrapers/update.py: 4 additions & 2 deletions

@@ -16,8 +16,10 @@ def run():
"""
This function is the entry point. There are no arguments.
"""
print("=== Update fireroad data ===")
fireroad.run()
print("=== Update fireroad data (pre-semester) ===")
fireroad.run(False)
print("=== Update fireroad data (semester) ===")
fireroad.run(True)
print("=== Update catalog data ===")
catalog.run()
print("=== Packaging ===")