Make file downloads a bit more generic and cooler

author: Galen Guyer <galen@galenguyer.com> 2022-11-09 18:01:25 -0500
committer: Galen Guyer <galen@galenguyer.com> 2022-11-09 18:02:25 -0500
commit: a0bd6c94dcced0bb43e7247016cb89fcc72cf834 (patch)
tree: 98e4bf98c0eb7385b7cf024f7ad177ffd4141f57
parent: 0e961e3f322de14cab462c8258c8f615e8744383 (diff)
3 files changed, 89 insertions, 18 deletions
diff --git a/.gitignore b/.gitignore
index 66ab5f8..ca69dd3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,12 @@
+# Compiled binaries and stuff
 /target
 
+# FCC files
 *.zip
 *.dat
 counts
+*.txt
+
+# Generated files
 *.db
 *.db.zst
diff --git a/README.md b/README.md
index 35355ac..899dae1 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,11 @@
 
 The FCC publishes weekly full dumps and daily updates to their license database. The format is awful and the FCC search page is so slow. This dumps it into SQL so we can be speedy
 
+## Usage
+Before running the program, you must create your database. Run `sqlite3 fcc.db < migrations/01-create-db.sql` to do so.
+
+Once this is done, you can run artemis. Note that there's no checking of if a key exists yet or not. It'll simply yolo it, so if you run it multiple times you'll get multiple copies. This will be fixed Eventually.
+
 ## References
 All documentation for the FCC ULS database dumps can be found at [www.fcc.gov/wireless/data/public-access-files-database-downloads](https://www.fcc.gov/wireless/data/public-access-files-database-downloads)
 
diff --git a/src/main.rs b/src/main.rs
index 8ad0da8..c5efab0 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -10,11 +10,22 @@ mod load;
 mod types;
 
 const WEEKLY_DUMP_URL: &str = "https://data.fcc.gov/download/pub/uls/complete/l_amat.zip";
-
-fn download_file() -> Result<File, ()> {
-    let resp = ureq::get(WEEKLY_DUMP_URL)
-        .call()
-        .expect("Error downloading file");
+const SPECIAL_CONDITIONS_URL: &str = "https://www.fcc.gov/file/20669/download";
+
+/// Downloads a file from the given URL to the given path
+///
+/// # Arguments
+///
+/// * `url` - The URL to download from
+/// * `path` - The path to download to. If None, try and use the Content-Disposition
+/// header to determine the filename, and fall back to the last segment of the URL
+///
+/// # Examples
+/// ```
+/// download_file("https://data.fcc.gov/download/pub/uls/complete/l_amat.zip", None);
+/// ```
+fn download_file(url: &str, file_name: Option<&str>) -> Result<File, ()> {
+    let resp = ureq::get(url).call().expect("Error downloading file");
 
     // We can work on handling not having a Content-Length header later
     assert!(resp.has("Content-Length"));
@@ -24,20 +35,67 @@ fn download_file() -> Result<File, ()> {
         .parse()
         .expect("Error parsing Content-Length header");
 
-    let last_modified = DateTime::parse_from_rfc2822(
-        resp.header("Last-Modified")
-            .expect("Error getting Last-Modified header"),
-    )
-    .expect("Error parsing Last-Modified header")
-    .timestamp();
-
-    let output_file_name = "l_amat.zip";
+    let last_modified = match resp.header("Last-Modified") {
+        Some(last_mod) => {
+            match DateTime::parse_from_rfc2822(last_mod) {
+                Ok(dt) => Some(dt.timestamp()),
+                Err(_) => None,
+            }
+        }
+        None => {
+            None
+        }
+    };
+
+    // Time to determine the file name!
+    // Start by seeing if we were told anything, that makes it easy
+    // This is just a helper. It should be its own function. lmao.
+    let parse_file_name_from_url = |url: &str| {
+        let output_file_name_regex = Regex::new(r"/(\w+\.?\w*)").expect("Error constructing regex");
+        let Some(file_name_captures) = output_file_name_regex.captures_iter(url).last() else {
+            panic!("Error parsing file name from URL");
+        };
+        let Some(maybe_match) = file_name_captures.iter().last() else {
+            panic!("Error parsing file name from URL");
+        };
+        let Some(file_name_match) = maybe_match else {
+            panic!("Error parsing file name from URL");
+        };
+        String::from(file_name_match.as_str())
+    };
+    let output_file_name = match file_name {
+        Some(n) => String::from(n),
+        None => {
+            // We weren't given a file name by the user, so we need to figure it out ourself
+            match resp.header("Content-Disposition") {
+                // A Content-Disposition header is present, so we can use that
+                Some(content_disposition) => {
+                    let content_disposition_regex =
+                        Regex::new(r#"filename="([\w\.]+)""#).expect("Error compiling regex");
+                    // Check if the Content-Disposition header specifies a filename
+                    match content_disposition_regex.captures(content_disposition) {
+                        Some(cd_match) => {
+                            // We have a filename, so use that
+                            // TODO: Make less unwrappy
+                            cd_match.iter().last().unwrap().unwrap().as_str().to_string()
+                        }
+                        None => {
+                            // It doesn't, so we have to fall back to the file name in the URL
+                            parse_file_name_from_url(url)
+                        }
+                    }
+                }
+                // No Content-Disposition header, so we have to fall back to the file name in the URL
+                None => parse_file_name_from_url(url),
+            }
+        }
+    };
 
-    if std::path::Path::new(output_file_name).exists() {
-        let file_metadata = fs::metadata(output_file_name).expect("Error getting file metadata");
+    if std::path::Path::new(&output_file_name).exists() {
+        let file_metadata = fs::metadata(&output_file_name).expect("Error getting file metadata");
         let mtime = FileTime::from_last_modification_time(&file_metadata);
 
-        match (mtime.seconds() >= last_modified, file_metadata.len() == len) {
+        match (mtime.seconds() >= last_modified.unwrap_or(1), file_metadata.len() == len) {
             (true, true) => {
                 println!("File already downloaded");
                 return Ok(File::open(output_file_name).expect("Error opening file"));
@@ -53,7 +111,7 @@ fn download_file() -> Result<File, ()> {
         println!("File does not exist, downloading");
     }
 
-    let mut output_file = fs::File::create(output_file_name).expect("Error creating output file");
+    let mut output_file = fs::File::create(&output_file_name).expect("Error creating output file");
 
     let mut reader = resp.into_reader();
     let chunk_size = len / 99;
@@ -143,7 +201,10 @@ fn unzip_file(zip_file: File) -> Result<(), ()> {
 
 #[tokio::main]
 async fn main() {
-    let output_file = download_file().expect("Error downloading file");
+    let output_file = download_file(WEEKLY_DUMP_URL, None).expect("Error downloading weekly dump file");
+
+    #[allow(unused_variables)]
+    let conditions_file = download_file(SPECIAL_CONDITIONS_URL, None).expect("Error downloading Special Conditions file");
 
     unzip_file(output_file).expect("Error unzipping file");
author	Galen Guyer <galen@galenguyer.com>	2022-11-09 18:01:25 -0500
committer	Galen Guyer <galen@galenguyer.com>	2022-11-09 18:02:25 -0500
commit	a0bd6c94dcced0bb43e7247016cb89fcc72cf834 (patch)
tree	98e4bf98c0eb7385b7cf024f7ad177ffd4141f57
parent	0e961e3f322de14cab462c8258c8f615e8744383 (diff)