Compare commits

...

1 Commit

Author SHA1 Message Date
Michael Pilosov 4d9ad9ddb8 updates for threading 6 months ago
  1. Dockerfile (4 lines changed)
  2. app/app.py (5 lines changed)
  3. app/flows.py (264 lines changed)
  4. init_db.py (47 lines changed)
  5. start.sh (6 lines changed)

Dockerfile (4 lines changed)

@@ -35,8 +35,8 @@ RUN pip install --no-cache-dir --user -r /tmp/requirements.txt
 COPY --chown=user:user profiles.default.toml /home/user/.prefect/profiles.toml

 # Copy the application files
-COPY --chown=user:user app ./app
-COPY --chown=user:user noaa_animate.py .
+COPY --chown=user:user app ./
+# COPY --chown=user:user noaa_animate.py .
 COPY --chown=user:user start.sh .
 COPY --chown=user:user init_db.py .
 RUN chmod +x start.sh

app/app.py (5 lines changed)

@@ -1,3 +1,5 @@
+import threading
+from flows import create_animations
 import logging
 import os
 import re

@@ -68,7 +70,7 @@ def handle_api():
 @app.route("/videos/<path:filename>")
 def custom_static(filename):
-    return send_from_directory("../out", filename)
+    return send_from_directory("./out", filename)


 @app.route("/", methods=["GET"])

@@ -150,4 +152,5 @@ def proxy(url="", proto=PROTO):


 if __name__ == "__main__":
+    threading.Thread(target=create_animations.serve, daemon=True, kwargs={"name": "noaa-animate", "limit": 8}).start()
     app.run(host="0.0.0.0", port=9021, debug=True)

app/flows.py (264 lines added; new file)

@@ -0,0 +1,264 @@
import os
import re
from datetime import datetime, timedelta
from io import BytesIO
from typing import Any, Dict, Iterator, List

import httpx

# import imageio
import numpy as np
from moviepy.editor import ImageSequenceClip
from PIL import Image
from prefect import flow, task
from prefect.tasks import task_input_hash

BASE_URL = "https://services.swpc.noaa.gov/images/animations/geospace/"


@task(
    retries=3,
    retry_delay_seconds=5,
    cache_key_fn=task_input_hash,
    cache_expiration=timedelta(minutes=2),
    log_prints=True,
)
def get_file_links(url: str, ext: str | None = None) -> Iterator[str]:
    response = httpx.get(url)
    response.raise_for_status()
    webpage_content = response.text
    if ext is None:
        print("Extension not supplied. Inferring (less efficient) png/jpg/jpeg")
        exts = ["png", "jpg", "jpeg"]
    else:
        exts = [ext.lower()]

    lines = webpage_content.split("\n")
    for line in lines:
        for ext in exts:
            if ext in line:  # need to parse the href link
                start_pos = line.find('href="') + len('href="')
                end_pos = line.find('"', start_pos)
                href = line[start_pos:end_pos]

                if href.endswith(f"latest.{ext}"):
                    print("Skipping latest")
                    continue
                if href.endswith(ext):
                    if not href.startswith("http"):
                        href = url + href
                    yield href
                break  # Exit the inner loop to avoid duplicate yields for multiple exts


def url_tail_hash(context, parameters):
    # return a constant
    return parameters["url"].split("/")[-1]


def out_path_hash(context, parameters):
    return parameters["output_path"] + f"_L{len(parameters['images'])}"


@task(
    retries=5,
    retry_delay_seconds=1,
    cache_key_fn=task_input_hash,
    cache_expiration=timedelta(minutes=5),
    result_storage_key="{parameters[url]}",
)
def get_content(url: str, params: Dict[str, Any] | None = None):
    response = httpx.get(f"https://{url}", params=params)
    try:
        response.raise_for_status()
        return response.content
    except httpx.HTTPStatusError:
        return None


def preview_urls(urls):
    print("URLS (head):")
    print(urls[:5])
    print("URLS (tail):")
    print(urls[-5:])


@task(
    cache_key_fn=task_input_hash,
    cache_expiration=timedelta(hours=1),
)
def get_images(urls: List[str], limit: int = 0):
    if limit > 0:
        print(f"Limiting to {limit} urls")
        urls = urls[-limit:]
    urls = [url.replace("https://", "").replace("http://", "") for url in urls]
    preview_urls(urls)
    futures = get_content.map(urls)
    images = [
        (urls[i], f.result()) for i, f in enumerate(futures) if f.result() is not None
    ]
    return images


def extract_timestamp_from_url(url: str) -> str:
    # Assuming the timestamp format is in the format shown in the screenshot
    match = re.search(r"\d{8}_\d{6}", url)
    return match.group(0) if match else ""


# @task(
#     cache_key_fn=out_path_hash,
#     cache_expiration=timedelta(minutes=3),
#     result_storage_key="{parameters[output_path]}",
# )
# def create_animation(
#     images: List[bytes], output_path: str, duration: float = 0.5
# ) -> None:
#     if not images:
#         raise ValueError("No images!")
#     pil_images = [Image.open(BytesIO(img_data)).convert("RGB") for img_data in images]
#     imageio.mimsave(output_path, pil_images, duration=duration)
#     return output_path


def make_even_dimensions(image):
    width, height = image.size
    if width % 2 == 1:
        width -= 1
    if height % 2 == 1:
        height -= 1
    return image.resize((width, height), Image.ANTIALIAS)


def crop_to_even(image):
    width, height = image.size
    # Adjust width and height to be even
    if width % 2 == 1:
        width -= 1
    if height % 2 == 1:
        height -= 1
    return image.crop((0, 0, width, height))


@task(
    cache_key_fn=out_path_hash,
    cache_expiration=timedelta(hours=4),
    result_storage_key="{parameters[output_path]}",
)
def create_mp4_animation(images: List[bytes], output_path: str, fps: int = 24) -> None:
    # Convert bytes to PIL images and then to numpy arrays
    frames = [
        np.array(crop_to_even(Image.open(BytesIO(img_data)).convert("RGB")))
        for img_data in images
    ]

    # Create a video clip from the image sequence
    clip = ImageSequenceClip(frames, fps=fps)

    # Write the video clip to a file
    clip.write_videofile(
        output_path,
        codec="libx264",
        ffmpeg_params=["-pix_fmt", "yuv420p"],
        preset="medium",
        bitrate="800k",
    )

    return output_path


def format_output_name(url: str, latest: bool = False):
    if latest:
        now = "latest"
    else:
        now = datetime.now().strftime("%Y%m%d-%H:%M:%S")
    return (
        url.replace("https://", "")
        .replace("http://", "")
        .replace("/", "-")
        .replace(".", "_")
        + now
    )


@task(
    name="animate",
    retries=0,
    retry_delay_seconds=1,
    log_prints=True,
    cache_key_fn=task_input_hash,
    cache_expiration=timedelta(minutes=3),
)
def animate(
    url: str = "https://services.swpc.noaa.gov/images/animations/geospace/density/",
    ext: str = "png",
    latest: bool = True,
    limit: int = 0,
):
    urls = get_file_links(url, ext)
    if len(urls) == 0:
        raise ValueError("No urls scraped")
    images = get_images(list(sorted(urls)), limit=limit)
    if len(images) == 0:
        raise ValueError("No images retrieved.")
    print(f"Retrieved {len(images)} images.")
    sorted_images = sorted(images, key=lambda x: extract_timestamp_from_url(x[0]))
    print("Head:")
    print([u for u, i in sorted_images[:5]])
    frames = [s[1] for s in sorted_images]
    # create_animation(frames, "out.gif", duration=5)
    out_name = format_output_name(url, latest=latest)
    create_mp4_animation(frames, f"out/{out_name}.mp4")


def deploy_name():
    return datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%S") + "Z"


@flow(
    name="create-animations",
    retries=0,
    retry_delay_seconds=1,
    log_prints=True,
    flow_run_name=None,
    timeout_seconds=90,
)
def create_animations(
    url: str | List[str] = BASE_URL + "velocity/",
    ext: str | None = None,
    latest: bool = False,
    limit: int = 0,
):
    if isinstance(url, str):
        url = [url]
    futures = animate.map(url, ext, latest, limit)
    return futures


if __name__ == "__main__":
    # make_animation.from_source(
    #     source=TEST_REPO,
    #     entrypoint="noaa_animate.py:make_animation",
    # ).deploy(
    #     name="noaa-animate", work_pool_name="process"
    # )
    from prefect.client.schemas.schedules import CronSchedule

    sched = CronSchedule(cron="*/15 * * * *", timezone="America/Denver")

    LINKS = [
        BASE_URL + "density/",
        BASE_URL + "velocity/",
        BASE_URL + "pressure/",
    ]

    sched_params = {
        "latest": True,
        "url": LINKS,
        "ext": "png",
        "limit": 0,
    }
    # note: sched is defined above but not used; serve() gets schedule=None
    create_animations.serve(
        "noaa-animate", limit=8, schedule=None, parameters=sched_params
    )
    # make_animation(url)
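
The frame ordering in animate() relies on extract_timestamp_from_url() finding a YYYYMMDD_HHMMSS stamp in each URL. A quick standalone check of that sort key, using made-up filenames that follow the expected NOAA naming pattern (illustrative, not scraped from the live site):

import re


def extract_timestamp_from_url(url: str) -> str:
    # same regex as in app/flows.py
    match = re.search(r"\d{8}_\d{6}", url)
    return match.group(0) if match else ""


# hypothetical frame URLs following the expected naming convention
urls = [
    "services.swpc.noaa.gov/images/animations/geospace/density/density_20240101_120000.png",
    "services.swpc.noaa.gov/images/animations/geospace/density/density_20240101_114500.png",
]
print(sorted(urls, key=extract_timestamp_from_url))
# the 114500 (11:45:00) frame sorts ahead of the 120000 (12:00:00) frame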

init_db.py (47 lines changed)

@@ -16,34 +16,40 @@ def initialize_db(db_path):
         logging.info(f"{db_path} exists")  # Log a message if the database exists
         return
     try:
-        with sqlite3.connect(
-            db_path
-        ) as conn:  # Using 'with' to ensure that the connection is closed automatically
+        # Using 'with' to ensure that the connection is closed automatically
+        with sqlite3.connect(db_path) as conn:
             configure_database(conn)
     except sqlite3.Error as e:
         logging.error(f"Database error: {e}")  # Log any SQLite errors that occur
     except Exception as e:
-        logging.error(
-            f"Exception in initialize_db: {e}"
-        )  # Log any other exceptions that occur
+        logging.error(f"Exception in initialize_db: {e}")


 def configure_database(conn):
     cursor = conn.cursor()
     # Setting the journal mode to WAL for better concurrency
     cursor.execute("PRAGMA journal_mode = WAL;")
+    # Ensuring foreign key constraints are enforced for data integrity
+    cursor.execute("PRAGMA foreign_keys = ON;")
+    # disable legacy alter table behavior as it will cause problems during
+    # migrations when tables are renamed as references would otherwise be retained
+    # in some locations
+    # https://www.sqlite.org/pragma.html#pragma_legacy_alter_table
+    cursor.execute("PRAGMA legacy_alter_table=OFF")
+    # when using the WAL, we do not need to sync changes on every write. sqlite
+    # recommends using 'normal' mode which is much faster
     # Setting synchronous to NORMAL for a balance between speed and reliability
     cursor.execute("PRAGMA synchronous = NORMAL;")
-    # Setting a busy timeout to prevent immediate failures when the database is locked
-    cursor.execute("PRAGMA busy_timeout = 5000;")
     # Increasing the cache size to reduce the number of disk I/O operations
-    cursor.execute("PRAGMA cache_size = -32000;")
+    cursor.execute("PRAGMA cache_size = 20000;")
     # Enabling memory-mapped I/O for potentially faster file operations
     cursor.execute("PRAGMA mmap_size = 536870912;")
+    # Setting a busy timeout to prevent immediate failures when the database is locked
+    # setting the value very high allows for more 'concurrency'
+    # without running into errors, but may result in slow api calls
+    cursor.execute("PRAGMA busy_timeout = 60000;")
     # Setting locking mode to EXCLUSIVE can enhance performance for single-user scenarios
-    cursor.execute("PRAGMA locking_mode = EXCLUSIVE;")
-    # Ensuring foreign key constraints are enforced for data integrity
-    cursor.execute("PRAGMA foreign_keys = ON;")
+    # cursor.execute("PRAGMA locking_mode = EXCLUSIVE;")

     conn.commit()  # Commit all PRAGMA configurations
     logging.info("Set up database with multi-user optimizations.")

@@ -55,13 +61,11 @@ def batch_transact(db_path, operations):
             db_path
         ) as conn:  # Ensure that the connection is handled properly
             cursor = conn.cursor()
-            cursor.execute(
-                "BEGIN TRANSACTION;"
-            )  # Start a transaction for batch operations
+            # Start a transaction for batch operations
+            cursor.execute("BEGIN TRANSACTION;")
             for operation in operations:
-                cursor.execute(
-                    operation
-                )  # Execute each SQL operation provided in the operations list
+                # Execute each SQL operation provided in the operations list
+                cursor.execute(operation)
             cursor.execute("COMMIT;")  # Commit all operations at once
     except sqlite3.Error as e:
         logging.error(f"Database error during batch transaction: {e}")

@@ -73,9 +77,8 @@ def maintenance(db_path):
     try:
         with sqlite3.connect(db_path) as conn:
             cursor = conn.cursor()
-            cursor.execute(
-                "PRAGMA optimize;"
-            )  # Optimize the database to maintain performance
+            # Optimize the database to maintain performance
+            cursor.execute("PRAGMA optimize;")
             cursor.execute("VACUUM;")  # Reclaim space and defragment the database file
     except sqlite3.Error as e:
         logging.error(f"Database error during maintenance: {e}")

start.sh (6 lines changed)

@@ -1,14 +1,14 @@
 #!/bin/bash

-# Start the web app
-cd app && make &
-
 # Start Prefect in the background
 prefect server start --host 0.0.0.0 &
 sleep 10

+# Start the web app
+make &
+
 # Start the deployment
-python noaa_animate.py &
+python flows.py &

 # Wait for all background jobs to finish
 wait
