author     Galen Guyer <galen@galenguyer.com>   2021-08-19 12:37:33 -0400
committer  Galen Guyer <galen@galenguyer.com>   2021-08-19 12:37:33 -0400
commit     d64b42a0c837d0d2a076bbd81586c39dfb2622a4
tree       f3bec68a66f13368ea13a7766ea34e1194b4ad73
parent     4bc585350a1b76606b50fe41d0aa699621dac25e
move scraping into a command
 poller/__init__.py  | 92 --------------------------------------------------
 poller/commands.py  | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 92 deletions(-)
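
This commit removes the in-process threading.Timer poller from poller/__init__.py and re-registers the same scraping logic as a Flask CLI command that loops with time.sleep. A minimal sketch of that pattern follows; the names poll and fetch_once, and the --interval option, are illustrative and not part of this repository:

# Minimal sketch (not repository code): a Flask CLI command that polls on a
# fixed interval, replacing a background threading.Timer with a blocking loop.
import time
import click
from flask import Flask

app = Flask(__name__)

@app.cli.command('poll')
@click.option('--interval', default=300, help='seconds between fetches')
def poll(interval):
    """Run with `flask poll`; Ctrl-C stops the loop."""
    while True:
        fetch_once()
        time.sleep(interval)

def fetch_once():
    # stand-in for the real scraper (get_data() below)
    print('fetching data')

With the command registered on APP, scraping no longer starts at import time; it would presumably be launched explicitly with something like `FLASK_APP=poller flask scrape`, depending on how the application entry point is configured.
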
diff --git a/poller/__init__.py b/poller/__init__.py
index 24b8e95..e8a108e 100644
--- a/poller/__init__.py
+++ b/poller/__init__.py
@@ -5,19 +5,11 @@ Startup code
import os
import json
import logging
-import requests
import datetime
-import threading
from flask import Flask
-from bs4 import BeautifulSoup
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
-
-POOL_TIME = 5 * 60 # Seconds
-DASHBOARD_URL = 'https://rit.edu/ready/summer-dashboard'
-DATA_THREAD = threading.Thread()
-
APP = Flask(__name__)
# Load default configuration and any environment variable overrides
@@ -45,87 +37,3 @@ from . import commands
from . import routes
db.create_all()
-
-from .models import Day
-
-def data_are_same(old, new):
- return old.total_students == new.total_students and \
- old.total_staff == new.total_staff and \
- old.new_students == new.new_students and \
- old.new_staff == new.new_staff and \
- old.quarantine_on_campus == new.quarantine_on_campus and \
- old.quarantine_off_campus == new.quarantine_off_campus and \
- old.isolation_on_campus == new.isolation_on_campus and \
- old.isolation_off_campus == new.isolation_off_campus and \
- old.beds_available == new.beds_available and \
- old.tests_administered == new.tests_administered and \
- old.alert_level == new.alert_level
-
-
-def get_data():
- print('fetching data')
- global DATA_THREAD
- DATA_THREAD = threading.Timer(POOL_TIME, get_data, ())
- DATA_THREAD.start()
- page = requests.get(DASHBOARD_URL, headers={'Cache-Control': 'no-cache'})
- soup = BeautifulSoup(page.content, 'html.parser')
- #total_students = int(soup.find('div', attrs={'class': 'statistic-13872'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- #total_staff = int(soup.find('div', attrs={'class': 'statistic-13875'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- new_students = int(soup.find('div', attrs={'class': 'statistic-14884'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- new_staff = int(soup.find('div', attrs={'class': 'statistic-14887'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- #quarantine_on_campus = int(soup.find('div', attrs={'class': 'statistic-13893'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- #quarantine_off_campus = int(soup.find('div', attrs={'class': 'statistic-13896'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- #isolation_on_campus = int(soup.find('div', attrs={'class': 'statistic-13905'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- #isolation_off_campus = int(soup.find('div', attrs={'class': 'statistic-13908'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
- #beds_available = int(soup.find('div', attrs={'class': 'statistic-13935'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().strip('%'))
- #tests_administered = int(soup.find('div', attrs={'class': 'statistic-13923'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().replace("*", " ").replace(",", ""))
- #container = soup.find('div', attrs={'id': 'pandemic-message-container'})
- #alert_level = container.find('a').text
- color = ""
- #if "Green" in alert_level:
- # color = 'green'
- #elif "Yellow" in alert_level:
- # color = 'yellow'
- #elif "Orange" in alert_level:
- # color = 'orange'
- #elif "Red" in alert_level:
- # color = 'red'
-
- #fall_data = None
- #with open('history/fall-2020.json', 'r') as fd:
- # fall_data = json.loads(fd.read())
- current_data = Day(
- last_updated=datetime.datetime.now(),
- alert_level=color,
- beds_available=-1,
- isolation_off_campus=-1,
- isolation_on_campus=-1,
- new_staff=new_staff,
- new_students=new_students,
- quarantine_off_campus=-1,
- quarantine_on_campus=-1,
- tests_administered=-1,
- total_staff=-1,
- total_students=-1)
- print(current_data.serialize())
- try:
- if not data_are_same(Day.get_all()[-1], current_data):
- db.session.add(current_data)
- except IndexError:
- db.session.add(current_data)
- dedup()
- return current_data
-
-def dedup():
- data = Day.get_all()
- # get first date
- starting_date = data[-1].serialize()['last_updated'].split(' ')[0]
- for i in range(len(data)-2, 0, -1):
- if data[i].serialize()['last_updated'].split(' ')[0] != starting_date:
- starting_date = data[i].serialize()['last_updated'].split(' ')[0]
- else:
- db.session.delete(data[i])
- print('dropped ' + data[i].serialize()['last_updated'])
- db.session.commit()
-
-get_data()
diff --git a/poller/commands.py b/poller/commands.py
index fc45dc7..c983578 100644
--- a/poller/commands.py
+++ b/poller/commands.py
@@ -3,10 +3,16 @@ CLI commands for data management
"""
import json
import click
+import time
+import requests
+import datetime
from dateutil import parser
+from bs4 import BeautifulSoup
from . import APP, db
from .models import Day
+DASHBOARD_URL = 'https://rit.edu/ready/summer-dashboard'
+
@APP.cli.command('import-history')
@click.argument('history_file')
@@ -34,3 +40,87 @@ def import_history(history_file):
total_staff=item['total_staff'],
total_students=item['total_students']))
db.session.commit()
+
+
+@APP.cli.command('scrape')
+def scrape():
+ while True:
+ get_data()
+ time.sleep(5*60)
+
+def data_are_same(old, new):
+ return old.total_students == new.total_students and \
+ old.total_staff == new.total_staff and \
+ old.new_students == new.new_students and \
+ old.new_staff == new.new_staff and \
+ old.quarantine_on_campus == new.quarantine_on_campus and \
+ old.quarantine_off_campus == new.quarantine_off_campus and \
+ old.isolation_on_campus == new.isolation_on_campus and \
+ old.isolation_off_campus == new.isolation_off_campus and \
+ old.beds_available == new.beds_available and \
+ old.tests_administered == new.tests_administered and \
+ old.alert_level == new.alert_level
+
+
+def get_data():
+ print('fetching data')
+ page = requests.get(DASHBOARD_URL, headers={'Cache-Control': 'no-cache'})
+ soup = BeautifulSoup(page.content, 'html.parser')
+ #total_students = int(soup.find('div', attrs={'class': 'statistic-13872'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ #total_staff = int(soup.find('div', attrs={'class': 'statistic-13875'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ new_students = int(soup.find('div', attrs={'class': 'statistic-14884'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ new_staff = int(soup.find('div', attrs={'class': 'statistic-14887'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ #quarantine_on_campus = int(soup.find('div', attrs={'class': 'statistic-13893'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ #quarantine_off_campus = int(soup.find('div', attrs={'class': 'statistic-13896'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ #isolation_on_campus = int(soup.find('div', attrs={'class': 'statistic-13905'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ #isolation_off_campus = int(soup.find('div', attrs={'class': 'statistic-13908'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip())
+ #beds_available = int(soup.find('div', attrs={'class': 'statistic-13935'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().strip('%'))
+ #tests_administered = int(soup.find('div', attrs={'class': 'statistic-13923'}).find_all("p", attrs={'class': 'card-header'})[0].text.strip().replace("*", " ").replace(",", ""))
+ #container = soup.find('div', attrs={'id': 'pandemic-message-container'})
+ #alert_level = container.find('a').text
+ color = ""
+ #if "Green" in alert_level:
+ # color = 'green'
+ #elif "Yellow" in alert_level:
+ # color = 'yellow'
+ #elif "Orange" in alert_level:
+ # color = 'orange'
+ #elif "Red" in alert_level:
+ # color = 'red'
+
+ #fall_data = None
+ #with open('history/fall-2020.json', 'r') as fd:
+ # fall_data = json.loads(fd.read())
+ current_data = Day(
+ last_updated=datetime.datetime.now(),
+ alert_level=color,
+ beds_available=-1,
+ isolation_off_campus=-1,
+ isolation_on_campus=-1,
+ new_staff=new_staff,
+ new_students=new_students,
+ quarantine_off_campus=-1,
+ quarantine_on_campus=-1,
+ tests_administered=-1,
+ total_staff=-1,
+ total_students=-1)
+ print(current_data.serialize())
+ try:
+ if not data_are_same(Day.get_all()[-1], current_data):
+ db.session.add(current_data)
+ except IndexError:
+ db.session.add(current_data)
+ dedup()
+ return current_data
+
+def dedup():
+ data = Day.get_all()
+ # get first date
+ starting_date = data[-1].serialize()['last_updated'].split(' ')[0]
+ for i in range(len(data)-2, 0, -1):
+ if data[i].serialize()['last_updated'].split(' ')[0] != starting_date:
+ starting_date = data[i].serialize()['last_updated'].split(' ')[0]
+ else:
+ db.session.delete(data[i])
+ print('dropped ' + data[i].serialize()['last_updated'])
+ db.session.commit()
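
The dedup() helper carried over unchanged in this move walks the history from newest to oldest and, assuming Day.get_all() returns rows in chronological order, keeps only the last record captured on each calendar date (the oldest row, at index 0, is never visited). A standalone sketch of that pass over plain timestamp strings, with illustrative sample values:

# Sketch of the dedup pass with illustrative data, mirroring the loop above.
rows = ['2021-08-17 09:00', '2021-08-18 09:00', '2021-08-18 14:00', '2021-08-19 09:00']
kept_date = rows[-1].split(' ')[0]         # date of the newest row
for i in range(len(rows) - 2, 0, -1):      # newest-to-oldest, skipping index 0
    date = rows[i].split(' ')[0]
    if date != kept_date:
        kept_date = date                   # first row seen for this date: keep it
    else:
        print('dropped ' + rows[i])        # older duplicate for the day: would be deleted

Running the sketch prints "dropped 2021-08-18 09:00", matching how the real function deletes the earlier of two rows recorded on the same day before committing the session.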