From d64b42a0c837d0d2a076bbd81586c39dfb2622a4 Mon Sep 17 00:00:00 2001 From: Galen Guyer Date: Thu, 19 Aug 2021 12:37:33 -0400 Subject: move scraping into a command --- poller/__init__.py | 92 ------------------------------------------------------ poller/commands.py | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 92 deletions(-) diff --git a/poller/__init__.py b/poller/__init__.py index 24b8e95..e8a108e 100644 --- a/poller/__init__.py +++ b/poller/__init__.py @@ -5,19 +5,11 @@ Startup code import os import json import logging -import requests import datetime -import threading from flask import Flask -from bs4 import BeautifulSoup from flask_sqlalchemy import SQLAlchemy from flask_migrate import Migrate - -POOL_TIME = 5 * 60 # Seconds -DASHBOARD_URL = 'https://rit.edu/ready/summer-dashboard' -DATA_THREAD = threading.Thread() - APP = Flask(__name__) # Load default configuration and any environment variable overrides @@ -45,87 +37,3 @@ from . import commands from . import routes db.create_all() - -from .models import Day - -def data_are_same(old, new): - return old.total_students == new.total_students and \ - old.total_staff == new.total_staff and \ - old.new_students == new.new_students and \ - old.new_staff == new.new_staff and \ - old.quarantine_on_campus == new.quarantine_on_campus and \ - old.quarantine_off_campus == new.quarantine_off_campus and \ - old.isolation_on_campus == new.isolation_on_campus and \ - old.isolation_off_campus == new.isolation_off_campus and \ - old.beds_available == new.beds_available and \ - old.tests_administered == new.tests_administered and \ - old.alert_level == new.alert_level - - -def get_data(): - print('fetching data') - global DATA_THREAD - DATA_THREAD = threading.Timer(POOL_TIME, get_data, ()) - DATA_THREAD.start() - page = requests.get(DASHBOARD_URL, headers={'Cache-Control': 'no-cache'}) - soup = BeautifulSoup(page.content, 'html.parser') - #total_students = int(soup.find('div', attrs={'class': 'statistic-13872'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - #total_staff = int(soup.find('div', attrs={'class': 'statistic-13875'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - new_students = int(soup.find('div', attrs={'class': 'statistic-14884'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - new_staff = int(soup.find('div', attrs={'class': 'statistic-14887'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - #quarantine_on_campus = int(soup.find('div', attrs={'class': 'statistic-13893'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - #quarantine_off_campus = int(soup.find('div', attrs={'class': 'statistic-13896'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - #isolation_on_campus = int(soup.find('div', attrs={'class': 'statistic-13905'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - #isolation_off_campus = int(soup.find('div', attrs={'class': 'statistic-13908'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) - #beds_available = int(soup.find('div', attrs={'class': 'statistic-13935'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().strip('%')) - #tests_administered = int(soup.find('div', attrs={'class': 'statistic-13923'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().replace("*", " ").replace(",", "")) - #container = soup.find('div', attrs={'id': 'pandemic-message-container'}) - #alert_level = container.find('a').text - color = "" - #if "Green" in alert_level: - # color = 'green' - #elif "Yellow" in alert_level: - # color = 'yellow' - #elif "Orange" in alert_level: - # color = 'orange' - #elif "Red" in alert_level: - # color = 'red' - - #fall_data = None - #with open('history/fall-2020.json', 'r') as fd: - # fall_data = json.loads(fd.read()) - current_data = Day( - last_updated=datetime.datetime.now(), - alert_level=color, - beds_available=-1, - isolation_off_campus=-1, - isolation_on_campus=-1, - new_staff=new_staff, - new_students=new_students, - quarantine_off_campus=-1, - quarantine_on_campus=-1, - tests_administered=-1, - total_staff=-1, - total_students=-1) - print(current_data.serialize()) - try: - if not data_are_same(Day.get_all()[-1], current_data): - db.session.add(current_data) - except IndexError: - db.session.add(current_data) - dedup() - return current_data - -def dedup(): - data = Day.get_all() - # get first date - starting_date = data[-1].serialize()['last_updated'].split(' ')[0] - for i in range(len(data)-2, 0, -1): - if data[i].serialize()['last_updated'].split(' ')[0] != starting_date: - starting_date = data[i].serialize()['last_updated'].split(' ')[0] - else: - db.session.delete(data[i]) - print('dropped ' + data[i].serialize()['last_updated']) - db.session.commit() - -get_data() diff --git a/poller/commands.py b/poller/commands.py index fc45dc7..c983578 100644 --- a/poller/commands.py +++ b/poller/commands.py @@ -3,10 +3,16 @@ CLI commands for data management """ import json import click +import time +import requests +import datetime from dateutil import parser +from bs4 import BeautifulSoup from . import APP, db from .models import Day +DASHBOARD_URL = 'https://rit.edu/ready/summer-dashboard' + @APP.cli.command('import-history') @click.argument('history_file') @@ -34,3 +40,87 @@ def import_history(history_file): total_staff=item['total_staff'], total_students=item['total_students'])) db.session.commit() + + +@APP.cli.command('scrape') +def scrape(): + while True: + get_data() + time.sleep(5*60) + +def data_are_same(old, new): + return old.total_students == new.total_students and \ + old.total_staff == new.total_staff and \ + old.new_students == new.new_students and \ + old.new_staff == new.new_staff and \ + old.quarantine_on_campus == new.quarantine_on_campus and \ + old.quarantine_off_campus == new.quarantine_off_campus and \ + old.isolation_on_campus == new.isolation_on_campus and \ + old.isolation_off_campus == new.isolation_off_campus and \ + old.beds_available == new.beds_available and \ + old.tests_administered == new.tests_administered and \ + old.alert_level == new.alert_level + + +def get_data(): + print('fetching data') + page = requests.get(DASHBOARD_URL, headers={'Cache-Control': 'no-cache'}) + soup = BeautifulSoup(page.content, 'html.parser') + #total_students = int(soup.find('div', attrs={'class': 'statistic-13872'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + #total_staff = int(soup.find('div', attrs={'class': 'statistic-13875'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + new_students = int(soup.find('div', attrs={'class': 'statistic-14884'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + new_staff = int(soup.find('div', attrs={'class': 'statistic-14887'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + #quarantine_on_campus = int(soup.find('div', attrs={'class': 'statistic-13893'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + #quarantine_off_campus = int(soup.find('div', attrs={'class': 'statistic-13896'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + #isolation_on_campus = int(soup.find('div', attrs={'class': 'statistic-13905'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + #isolation_off_campus = int(soup.find('div', attrs={'class': 'statistic-13908'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip()) + #beds_available = int(soup.find('div', attrs={'class': 'statistic-13935'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().strip('%')) + #tests_administered = int(soup.find('div', attrs={'class': 'statistic-13923'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().replace("*", " ").replace(",", "")) + #container = soup.find('div', attrs={'id': 'pandemic-message-container'}) + #alert_level = container.find('a').text + color = "" + #if "Green" in alert_level: + # color = 'green' + #elif "Yellow" in alert_level: + # color = 'yellow' + #elif "Orange" in alert_level: + # color = 'orange' + #elif "Red" in alert_level: + # color = 'red' + + #fall_data = None + #with open('history/fall-2020.json', 'r') as fd: + # fall_data = json.loads(fd.read()) + current_data = Day( + last_updated=datetime.datetime.now(), + alert_level=color, + beds_available=-1, + isolation_off_campus=-1, + isolation_on_campus=-1, + new_staff=new_staff, + new_students=new_students, + quarantine_off_campus=-1, + quarantine_on_campus=-1, + tests_administered=-1, + total_staff=-1, + total_students=-1) + print(current_data.serialize()) + try: + if not data_are_same(Day.get_all()[-1], current_data): + db.session.add(current_data) + except IndexError: + db.session.add(current_data) + dedup() + return current_data + +def dedup(): + data = Day.get_all() + # get first date + starting_date = data[-1].serialize()['last_updated'].split(' ')[0] + for i in range(len(data)-2, 0, -1): + if data[i].serialize()['last_updated'].split(' ')[0] != starting_date: + starting_date = data[i].serialize()['last_updated'].split(' ')[0] + else: + db.session.delete(data[i]) + print('dropped ' + data[i].serialize()['last_updated']) + db.session.commit() -- cgit v1.2.3