Enhance configuration and processing with DeerMapper API integration, adjustable parameters, and improved error handling
This commit is contained in:
19
config.yaml
19
config.yaml
@@ -8,6 +8,10 @@ s3:
|
|||||||
postgres:
|
postgres:
|
||||||
dsn: "postgresql://postgres:DFGk5s9H21lKao2K@136.243.41.58:7777/meles"
|
dsn: "postgresql://postgres:DFGk5s9H21lKao2K@136.243.41.58:7777/meles"
|
||||||
|
|
||||||
|
deermapper-api:
|
||||||
|
base_url: "https://webapp.deermapper.net/api/icu"
|
||||||
|
apiKey: "695bc217-3b40-48bb-bb12-17fc5b08b320"
|
||||||
|
|
||||||
app:
|
app:
|
||||||
entrance_prefix: "icu/entrance/"
|
entrance_prefix: "icu/entrance/"
|
||||||
processed_prefix: "icu/processed/"
|
processed_prefix: "icu/processed/"
|
||||||
@@ -15,8 +19,17 @@ app:
|
|||||||
min_age_seconds: 90
|
min_age_seconds: 90
|
||||||
poll_seconds: 30
|
poll_seconds: 30
|
||||||
# OCR crop tunables (defaults are usually fine)
|
# OCR crop tunables (defaults are usually fine)
|
||||||
ocr_crop_w_frac: 0.42
|
ocr_crop_w_frac: 0.63
|
||||||
ocr_crop_h_frac: 0.22
|
ocr_crop_h_frac: 0.05
|
||||||
thumb_max: 512
|
thumb_max: 512
|
||||||
exiftool_timeout_seconds: 15
|
exiftool_timeout_seconds: 15
|
||||||
job_sleep_seconds: 1.0
|
job_sleep_seconds: 0.0
|
||||||
|
parallel_workers: 6
|
||||||
|
s3_max_pool_connections: 48
|
||||||
|
# Optional processing switches:
|
||||||
|
# - false => image still gets moved + thumbnail, and backfill flag is set in import_job
|
||||||
|
enable_ocr: true
|
||||||
|
enable_exif: true
|
||||||
|
enable_deermapper_api: false
|
||||||
|
deermapper_api_timeout_seconds: 20
|
||||||
|
deermapper_api_image_field: "image"
|
||||||
|
|||||||
300
main.py
300
main.py
@@ -6,12 +6,17 @@ import json
|
|||||||
import time
|
import time
|
||||||
import tempfile
|
import tempfile
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import uuid as uuidlib
|
||||||
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from typing import Optional, Dict, Tuple
|
from typing import Optional, Dict, Tuple
|
||||||
|
from urllib import error as urlerror
|
||||||
|
from urllib import request as urlrequest
|
||||||
|
|
||||||
import boto3
|
import boto3
|
||||||
import psycopg
|
import psycopg
|
||||||
|
from botocore.config import Config as BotoClientConfig
|
||||||
from dateutil import tz
|
from dateutil import tz
|
||||||
from PIL import Image, ImageOps, ImageEnhance
|
from PIL import Image, ImageOps, ImageEnhance
|
||||||
import pytesseract
|
import pytesseract
|
||||||
@@ -65,7 +70,16 @@ class AppConfig:
|
|||||||
|
|
||||||
thumb_max: int = 512
|
thumb_max: int = 512
|
||||||
exiftool_timeout_seconds: int = 15
|
exiftool_timeout_seconds: int = 15
|
||||||
job_sleep_seconds: float = 1.0
|
job_sleep_seconds: float = 0.0
|
||||||
|
parallel_workers: int = 4
|
||||||
|
s3_max_pool_connections: int = 32
|
||||||
|
enable_ocr: bool = True
|
||||||
|
enable_exif: bool = True
|
||||||
|
enable_deermapper_api: bool = False
|
||||||
|
deermapper_api_base_url: str = "https://webapp.deermapper.net/api/icu"
|
||||||
|
deermapper_api_key: str = ""
|
||||||
|
deermapper_api_timeout_seconds: int = 20
|
||||||
|
deermapper_api_image_field: str = "image"
|
||||||
|
|
||||||
|
|
||||||
def load_config(path: str) -> AppConfig:
|
def load_config(path: str) -> AppConfig:
|
||||||
@@ -76,9 +90,26 @@ def load_config(path: str) -> AppConfig:
|
|||||||
def env_or(key: str, default=None):
|
def env_or(key: str, default=None):
|
||||||
return os.environ.get(key, default)
|
return os.environ.get(key, default)
|
||||||
|
|
||||||
|
def as_bool(value, default: bool = True) -> bool:
|
||||||
|
if value is None:
|
||||||
|
return default
|
||||||
|
if isinstance(value, bool):
|
||||||
|
return value
|
||||||
|
if isinstance(value, (int, float)):
|
||||||
|
return bool(value)
|
||||||
|
if isinstance(value, str):
|
||||||
|
return value.strip().lower() in ("1", "true", "yes", "on")
|
||||||
|
return bool(value)
|
||||||
|
|
||||||
s3 = raw["s3"]
|
s3 = raw["s3"]
|
||||||
pg = raw["postgres"]
|
pg = raw["postgres"]
|
||||||
app = raw.get("app", {})
|
app = raw.get("app", {})
|
||||||
|
deermapper = raw.get("deermapper-api", raw.get("deermapper_api", {}))
|
||||||
|
default_workers = max(1, min(16, (os.cpu_count() or 2) * 2))
|
||||||
|
parallel_workers = int(app.get("parallel_workers", default_workers))
|
||||||
|
parallel_workers = max(1, parallel_workers)
|
||||||
|
s3_pool = int(app.get("s3_max_pool_connections", max(16, parallel_workers * 4)))
|
||||||
|
s3_pool = max(10, s3_pool)
|
||||||
|
|
||||||
return AppConfig(
|
return AppConfig(
|
||||||
s3_endpoint=env_or("S3_ENDPOINT", s3["endpoint"]),
|
s3_endpoint=env_or("S3_ENDPOINT", s3["endpoint"]),
|
||||||
@@ -95,7 +126,16 @@ def load_config(path: str) -> AppConfig:
|
|||||||
ocr_crop_h_frac=float(app.get("ocr_crop_h_frac", 0.22)),
|
ocr_crop_h_frac=float(app.get("ocr_crop_h_frac", 0.22)),
|
||||||
thumb_max=int(app.get("thumb_max", 512)),
|
thumb_max=int(app.get("thumb_max", 512)),
|
||||||
exiftool_timeout_seconds=int(app.get("exiftool_timeout_seconds", 15)),
|
exiftool_timeout_seconds=int(app.get("exiftool_timeout_seconds", 15)),
|
||||||
job_sleep_seconds=float(app.get("job_sleep_seconds", 1.0)),
|
job_sleep_seconds=float(app.get("job_sleep_seconds", 0.0)),
|
||||||
|
parallel_workers=parallel_workers,
|
||||||
|
s3_max_pool_connections=s3_pool,
|
||||||
|
enable_ocr=as_bool(app.get("enable_ocr", True), True),
|
||||||
|
enable_exif=as_bool(app.get("enable_exif", True), True),
|
||||||
|
enable_deermapper_api=as_bool(app.get("enable_deermapper_api", False), False),
|
||||||
|
deermapper_api_base_url=env_or("DEERMAPPER_API_URL", deermapper.get("base_url", "https://webapp.deermapper.net/api/icu")),
|
||||||
|
deermapper_api_key=env_or("DEERMAPPER_API_KEY", deermapper.get("apiKey", "")),
|
||||||
|
deermapper_api_timeout_seconds=int(app.get("deermapper_api_timeout_seconds", 20)),
|
||||||
|
deermapper_api_image_field=app.get("deermapper_api_image_field", "image"),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -256,6 +296,71 @@ def make_thumbnail_bytes(jpg_bytes: bytes, thumb_max: int) -> bytes:
|
|||||||
return out.getvalue()
|
return out.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
def _multipart_form_data(fields: Dict[str, str], files: list[tuple[str, str, bytes, str]]) -> tuple[str, bytes]:
|
||||||
|
boundary = f"----meles-{uuidlib.uuid4().hex}"
|
||||||
|
boundary_bytes = boundary.encode("ascii")
|
||||||
|
crlf = b"\r\n"
|
||||||
|
|
||||||
|
parts: list[bytes] = []
|
||||||
|
for name, value in fields.items():
|
||||||
|
parts.extend([
|
||||||
|
b"--" + boundary_bytes,
|
||||||
|
f'Content-Disposition: form-data; name="{name}"'.encode("utf-8"),
|
||||||
|
b"",
|
||||||
|
str(value).encode("utf-8"),
|
||||||
|
])
|
||||||
|
for field_name, filename, content, content_type in files:
|
||||||
|
parts.extend([
|
||||||
|
b"--" + boundary_bytes,
|
||||||
|
f'Content-Disposition: form-data; name="{field_name}"; filename="{filename}"'.encode("utf-8"),
|
||||||
|
f"Content-Type: {content_type}".encode("utf-8"),
|
||||||
|
b"",
|
||||||
|
content,
|
||||||
|
])
|
||||||
|
parts.append(b"--" + boundary_bytes + b"--")
|
||||||
|
body = crlf.join(parts) + crlf
|
||||||
|
return f"multipart/form-data; boundary={boundary}", body
|
||||||
|
|
||||||
|
|
||||||
|
def push_to_deermapper_api(cfg: AppConfig, image_uuid: str, imei: str, metadata: dict, jpg_bytes: bytes) -> tuple[bool, str]:
|
||||||
|
fields = {
|
||||||
|
"apiKey": cfg.deermapper_api_key,
|
||||||
|
"imei": imei,
|
||||||
|
"metadata": json.dumps(metadata, ensure_ascii=False),
|
||||||
|
}
|
||||||
|
content_type, body = _multipart_form_data(
|
||||||
|
fields,
|
||||||
|
[(cfg.deermapper_api_image_field, f"{image_uuid}.jpg", jpg_bytes, "image/jpeg")],
|
||||||
|
)
|
||||||
|
req = urlrequest.Request(
|
||||||
|
cfg.deermapper_api_base_url,
|
||||||
|
data=body,
|
||||||
|
method="POST",
|
||||||
|
headers={
|
||||||
|
"Content-Type": content_type,
|
||||||
|
"Accept": "application/json",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
with urlrequest.urlopen(req, timeout=cfg.deermapper_api_timeout_seconds) as resp:
|
||||||
|
payload = resp.read().decode("utf-8", errors="replace")
|
||||||
|
try:
|
||||||
|
data = json.loads(payload) if payload else {}
|
||||||
|
except Exception:
|
||||||
|
return False, f"API response not JSON: {payload[:300]}"
|
||||||
|
status_val = str(data.get("status", "")).strip()
|
||||||
|
msg = str(data.get("msg", "")).strip()
|
||||||
|
if status_val == "1":
|
||||||
|
return True, msg or "ok"
|
||||||
|
return False, msg or f"status={status_val or 'unknown'}"
|
||||||
|
except urlerror.HTTPError as e:
|
||||||
|
body_txt = e.read().decode("utf-8", errors="replace")
|
||||||
|
return False, f"HTTP {e.code}: {body_txt[:300]}"
|
||||||
|
except Exception as e:
|
||||||
|
return False, str(e)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------
|
# -----------------------
|
||||||
# DB ops
|
# DB ops
|
||||||
# -----------------------
|
# -----------------------
|
||||||
@@ -272,10 +377,22 @@ def db_init(cur):
|
|||||||
entrance_cat_key text,
|
entrance_cat_key text,
|
||||||
processed_jpg_key text,
|
processed_jpg_key text,
|
||||||
thumbnail_key text,
|
thumbnail_key text,
|
||||||
|
needs_ocr_backfill boolean NOT NULL DEFAULT false,
|
||||||
|
needs_exif_backfill boolean NOT NULL DEFAULT false,
|
||||||
created_ts timestamptz NOT NULL DEFAULT now(),
|
created_ts timestamptz NOT NULL DEFAULT now(),
|
||||||
updated_ts timestamptz NOT NULL DEFAULT now()
|
updated_ts timestamptz NOT NULL DEFAULT now()
|
||||||
);
|
);
|
||||||
""")
|
""")
|
||||||
|
cur.execute("""
|
||||||
|
ALTER TABLE remote_cam.import_job
|
||||||
|
ADD COLUMN IF NOT EXISTS needs_ocr_backfill boolean NOT NULL DEFAULT false;
|
||||||
|
""")
|
||||||
|
cur.execute("""
|
||||||
|
ALTER TABLE remote_cam.import_job
|
||||||
|
ADD COLUMN IF NOT EXISTS needs_exif_backfill boolean NOT NULL DEFAULT false;
|
||||||
|
""")
|
||||||
|
cur.execute("CREATE INDEX IF NOT EXISTS import_job_needs_ocr_idx ON remote_cam.import_job(needs_ocr_backfill);")
|
||||||
|
cur.execute("CREATE INDEX IF NOT EXISTS import_job_needs_exif_idx ON remote_cam.import_job(needs_exif_backfill);")
|
||||||
cur.execute("CREATE INDEX IF NOT EXISTS import_job_status_idx ON remote_cam.import_job(status);")
|
cur.execute("CREATE INDEX IF NOT EXISTS import_job_status_idx ON remote_cam.import_job(status);")
|
||||||
cur.execute("CREATE INDEX IF NOT EXISTS import_job_updated_idx ON remote_cam.import_job(updated_ts);")
|
cur.execute("CREATE INDEX IF NOT EXISTS import_job_updated_idx ON remote_cam.import_job(updated_ts);")
|
||||||
|
|
||||||
@@ -298,6 +415,14 @@ def db_init(cur):
|
|||||||
END IF;
|
END IF;
|
||||||
END$$;
|
END$$;
|
||||||
""")
|
""")
|
||||||
|
cur.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS remote_cam.api_response (
|
||||||
|
push_ts timestamp without time zone,
|
||||||
|
message character(512),
|
||||||
|
image_uuid uuid,
|
||||||
|
success boolean
|
||||||
|
);
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
def job_upsert_new(cur, uuid: str, jpg_key: str, meta_key: str, cat_key: Optional[str]):
|
def job_upsert_new(cur, uuid: str, jpg_key: str, meta_key: str, cat_key: Optional[str]):
|
||||||
@@ -324,9 +449,20 @@ def job_set_status(cur, uuid: str, status: str, *, err: Optional[str] = None, pr
|
|||||||
""", (status, status, err, processed_key, thumb_key, uuid))
|
""", (status, status, err, processed_key, thumb_key, uuid))
|
||||||
|
|
||||||
|
|
||||||
|
def job_set_backfill(cur, uuid: str, *, needs_ocr_backfill: Optional[bool] = None, needs_exif_backfill: Optional[bool] = None):
|
||||||
|
if needs_ocr_backfill is None and needs_exif_backfill is None:
|
||||||
|
return
|
||||||
|
cur.execute("""
|
||||||
|
UPDATE remote_cam.import_job
|
||||||
|
SET needs_ocr_backfill = COALESCE(%s, needs_ocr_backfill),
|
||||||
|
needs_exif_backfill = COALESCE(%s, needs_exif_backfill)
|
||||||
|
WHERE image_uuid = %s;
|
||||||
|
""", (needs_ocr_backfill, needs_exif_backfill, uuid))
|
||||||
|
|
||||||
|
|
||||||
def job_get(cur, uuid: str):
|
def job_get(cur, uuid: str):
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
SELECT status, processed_jpg_key, thumbnail_key, has_categories
|
SELECT status, processed_jpg_key, thumbnail_key, has_categories, needs_ocr_backfill, needs_exif_backfill
|
||||||
FROM remote_cam.import_job
|
FROM remote_cam.import_job
|
||||||
WHERE image_uuid = %s;
|
WHERE image_uuid = %s;
|
||||||
""", (uuid,))
|
""", (uuid,))
|
||||||
@@ -338,6 +474,8 @@ def job_get(cur, uuid: str):
|
|||||||
"processed_jpg_key": row[1],
|
"processed_jpg_key": row[1],
|
||||||
"thumbnail_key": row[2],
|
"thumbnail_key": row[2],
|
||||||
"has_categories": row[3],
|
"has_categories": row[3],
|
||||||
|
"needs_ocr_backfill": row[4],
|
||||||
|
"needs_exif_backfill": row[5],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -345,10 +483,14 @@ STATUS_ORDER = [
|
|||||||
"NEW",
|
"NEW",
|
||||||
"META_OK",
|
"META_OK",
|
||||||
"OCR_OK",
|
"OCR_OK",
|
||||||
|
"OCR_SKIPPED",
|
||||||
"EXIF_OK",
|
"EXIF_OK",
|
||||||
|
"EXIF_SKIPPED",
|
||||||
"MOVED",
|
"MOVED",
|
||||||
"THUMB_OK",
|
"THUMB_OK",
|
||||||
"CATEGORIES_OK",
|
"CATEGORIES_OK",
|
||||||
|
"API_FAIL",
|
||||||
|
"API_OK",
|
||||||
"CLEANED",
|
"CLEANED",
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -359,6 +501,16 @@ def status_rank(status: Optional[str]) -> int:
|
|||||||
return -1
|
return -1
|
||||||
|
|
||||||
|
|
||||||
|
def is_terminal_success(status: Optional[str], cfg: AppConfig) -> bool:
|
||||||
|
if status is None:
|
||||||
|
return False
|
||||||
|
if status == "CLEANED":
|
||||||
|
return True
|
||||||
|
if cfg.enable_deermapper_api:
|
||||||
|
return status_rank(status) >= status_rank("API_OK")
|
||||||
|
return status_rank(status) >= status_rank("THUMB_OK")
|
||||||
|
|
||||||
|
|
||||||
def insert_resource(cur, uuid: str, typ: str):
|
def insert_resource(cur, uuid: str, typ: str):
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
INSERT INTO remote_cam.resource(resource_uuid, type, import_ts)
|
INSERT INTO remote_cam.resource(resource_uuid, type, import_ts)
|
||||||
@@ -484,6 +636,14 @@ def insert_categories(cur, uuid: str, cat_json: dict):
|
|||||||
""", (uuid, c.get("category"), c.get("score"), version))
|
""", (uuid, c.get("category"), c.get("score"), version))
|
||||||
|
|
||||||
|
|
||||||
|
def insert_api_response(cur, uuid: str, success: bool, message: str):
|
||||||
|
msg = (message or "")[:512]
|
||||||
|
cur.execute("""
|
||||||
|
INSERT INTO remote_cam.api_response(push_ts, message, image_uuid, success)
|
||||||
|
VALUES (now(), %s, %s::uuid, %s);
|
||||||
|
""", (msg, uuid, success))
|
||||||
|
|
||||||
|
|
||||||
# -----------------------
|
# -----------------------
|
||||||
# S3 ops wrapper
|
# S3 ops wrapper
|
||||||
# -----------------------
|
# -----------------------
|
||||||
@@ -495,6 +655,7 @@ class S3:
|
|||||||
endpoint_url=cfg.s3_endpoint,
|
endpoint_url=cfg.s3_endpoint,
|
||||||
aws_access_key_id=cfg.s3_access_key,
|
aws_access_key_id=cfg.s3_access_key,
|
||||||
aws_secret_access_key=cfg.s3_secret_key,
|
aws_secret_access_key=cfg.s3_secret_key,
|
||||||
|
config=BotoClientConfig(max_pool_connections=cfg.s3_max_pool_connections),
|
||||||
)
|
)
|
||||||
|
|
||||||
def list_entrance_objects(self):
|
def list_entrance_objects(self):
|
||||||
@@ -513,13 +674,21 @@ class S3:
|
|||||||
def head(self, key: str):
|
def head(self, key: str):
|
||||||
return self.client.head_object(Bucket=self.cfg.s3_bucket, Key=key)
|
return self.client.head_object(Bucket=self.cfg.s3_bucket, Key=key)
|
||||||
|
|
||||||
def delete_keys(self, keys):
|
def delete_keys(self, keys) -> int:
|
||||||
if not keys:
|
if not keys:
|
||||||
return
|
return 0
|
||||||
self.client.delete_objects(
|
resp = self.client.delete_objects(
|
||||||
Bucket=self.cfg.s3_bucket,
|
Bucket=self.cfg.s3_bucket,
|
||||||
Delete={"Objects": [{"Key": k} for k in keys], "Quiet": True},
|
Delete={"Objects": [{"Key": k} for k in keys], "Quiet": False},
|
||||||
)
|
)
|
||||||
|
errors = resp.get("Errors") or []
|
||||||
|
if errors:
|
||||||
|
details = "; ".join(
|
||||||
|
f"{e.get('Key')}:{e.get('Code')}:{e.get('Message')}" for e in errors[:3]
|
||||||
|
)
|
||||||
|
raise RuntimeError(f"S3 delete failed for {len(errors)} object(s): {details}")
|
||||||
|
deleted = resp.get("Deleted") or []
|
||||||
|
return len(deleted)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------
|
# -----------------------
|
||||||
@@ -590,27 +759,42 @@ def process_one(cur, s3: S3, cfg: AppConfig, s: EntranceSet):
|
|||||||
|
|
||||||
# 2) JPG -> OCR bottom-right -> ocr_ts in CustomerTimezone
|
# 2) JPG -> OCR bottom-right -> ocr_ts in CustomerTimezone
|
||||||
jpg_bytes = s3.get_bytes(s.jpg_key)
|
jpg_bytes = s3.get_bytes(s.jpg_key)
|
||||||
ocr_ts = None
|
ocr_ts = get_ocr_ts(cur, s.uuid)
|
||||||
if status_rank(status_now) < status_rank("OCR_OK"):
|
ocr_needs_run = cfg.enable_ocr and (
|
||||||
|
status_rank(status_now) < status_rank("OCR_OK") or status_now == "OCR_SKIPPED"
|
||||||
|
)
|
||||||
|
|
||||||
|
if ocr_needs_run:
|
||||||
try:
|
try:
|
||||||
customer_tzinfo, _ = parse_customer_tz(meta)
|
customer_tzinfo, _ = parse_customer_tz(meta)
|
||||||
ocr_ts = ocr_extract_timestamp(jpg_bytes, cfg.ocr_crop_w_frac, cfg.ocr_crop_h_frac, customer_tzinfo)
|
ocr_ts = ocr_extract_timestamp(jpg_bytes, cfg.ocr_crop_w_frac, cfg.ocr_crop_h_frac, customer_tzinfo)
|
||||||
update_ocr_ts(cur, s.uuid, ocr_ts)
|
update_ocr_ts(cur, s.uuid, ocr_ts)
|
||||||
insert_resource(cur, s.uuid, "image")
|
insert_resource(cur, s.uuid, "image")
|
||||||
|
job_set_backfill(cur, s.uuid, needs_ocr_backfill=False)
|
||||||
job_set_status(cur, s.uuid, "OCR_OK")
|
job_set_status(cur, s.uuid, "OCR_OK")
|
||||||
status_now = "OCR_OK"
|
status_now = "OCR_OK"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
job_set_backfill(cur, s.uuid, needs_ocr_backfill=True)
|
||||||
job_set_status(cur, s.uuid, "OCR_FAIL", err=str(e))
|
job_set_status(cur, s.uuid, "OCR_FAIL", err=str(e))
|
||||||
status_now = "OCR_FAIL"
|
status_now = "OCR_FAIL"
|
||||||
elif status_rank(status_now) >= status_rank("OCR_OK"):
|
elif not cfg.enable_ocr:
|
||||||
ocr_ts = get_ocr_ts(cur, s.uuid)
|
job_set_backfill(cur, s.uuid, needs_ocr_backfill=True)
|
||||||
|
if status_rank(status_now) < status_rank("OCR_SKIPPED"):
|
||||||
|
job_set_status(cur, s.uuid, "OCR_SKIPPED")
|
||||||
|
status_now = "OCR_SKIPPED"
|
||||||
|
|
||||||
# 3) EXIF write:
|
# 3) EXIF write:
|
||||||
# DateTimeOriginal = ocr_ts (local, keep offset tags)
|
# DateTimeOriginal = ocr_ts (local, keep offset tags)
|
||||||
# DateTimeDigitized = servertime (UTC)
|
# DateTimeDigitized = servertime (UTC)
|
||||||
jpg_exif = jpg_bytes
|
jpg_exif = jpg_bytes
|
||||||
if status_rank(status_now) < status_rank("MOVED"):
|
if not cfg.enable_exif:
|
||||||
|
job_set_backfill(cur, s.uuid, needs_exif_backfill=True)
|
||||||
|
if status_rank(status_now) < status_rank("EXIF_SKIPPED"):
|
||||||
|
job_set_status(cur, s.uuid, "EXIF_SKIPPED")
|
||||||
|
status_now = "EXIF_SKIPPED"
|
||||||
|
elif status_rank(status_now) < status_rank("MOVED"):
|
||||||
if ocr_ts is None:
|
if ocr_ts is None:
|
||||||
|
job_set_backfill(cur, s.uuid, needs_exif_backfill=True)
|
||||||
job_set_status(cur, s.uuid, "EXIF_SKIPPED", err="OCR missing")
|
job_set_status(cur, s.uuid, "EXIF_SKIPPED", err="OCR missing")
|
||||||
status_now = "EXIF_SKIPPED"
|
status_now = "EXIF_SKIPPED"
|
||||||
else:
|
else:
|
||||||
@@ -622,9 +806,11 @@ def process_one(cur, s3: S3, cfg: AppConfig, s: EntranceSet):
|
|||||||
servertime,
|
servertime,
|
||||||
timeout_seconds=cfg.exiftool_timeout_seconds,
|
timeout_seconds=cfg.exiftool_timeout_seconds,
|
||||||
)
|
)
|
||||||
|
job_set_backfill(cur, s.uuid, needs_exif_backfill=False)
|
||||||
job_set_status(cur, s.uuid, "EXIF_OK")
|
job_set_status(cur, s.uuid, "EXIF_OK")
|
||||||
status_now = "EXIF_OK"
|
status_now = "EXIF_OK"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
job_set_backfill(cur, s.uuid, needs_exif_backfill=True)
|
||||||
job_set_status(cur, s.uuid, "EXIF_FAIL", err=str(e))
|
job_set_status(cur, s.uuid, "EXIF_FAIL", err=str(e))
|
||||||
status_now = "EXIF_FAIL"
|
status_now = "EXIF_FAIL"
|
||||||
|
|
||||||
@@ -655,8 +841,32 @@ def process_one(cur, s3: S3, cfg: AppConfig, s: EntranceSet):
|
|||||||
job_set_status(cur, s.uuid, "CATEGORIES_OK")
|
job_set_status(cur, s.uuid, "CATEGORIES_OK")
|
||||||
status_now = "CATEGORIES_OK"
|
status_now = "CATEGORIES_OK"
|
||||||
|
|
||||||
# 7) Cleanup entrance only after verify
|
# 7) Optional DeerMapper API push (image + metadata)
|
||||||
if status_rank(status_now) >= status_rank("THUMB_OK"):
|
if cfg.enable_deermapper_api and status_rank(status_now) < status_rank("API_OK"):
|
||||||
|
api_key = (cfg.deermapper_api_key or "").strip()
|
||||||
|
api_url = (cfg.deermapper_api_base_url or "").strip()
|
||||||
|
if not api_key or not api_url:
|
||||||
|
err_msg = "DeerMapper API config missing"
|
||||||
|
job_set_status(cur, s.uuid, "API_FAIL", err=err_msg)
|
||||||
|
insert_api_response(cur, s.uuid, False, err_msg)
|
||||||
|
status_now = "API_FAIL"
|
||||||
|
else:
|
||||||
|
if status_rank(status_now) >= status_rank("MOVED") and jpg_exif is jpg_bytes:
|
||||||
|
jpg_exif = s3.get_bytes(processed_key)
|
||||||
|
imei = str(meta.get("imei") or "").strip()
|
||||||
|
ok, msg = push_to_deermapper_api(cfg, s.uuid, imei, meta, jpg_exif)
|
||||||
|
if ok:
|
||||||
|
insert_api_response(cur, s.uuid, True, msg)
|
||||||
|
job_set_status(cur, s.uuid, "API_OK")
|
||||||
|
status_now = "API_OK"
|
||||||
|
else:
|
||||||
|
insert_api_response(cur, s.uuid, False, msg)
|
||||||
|
job_set_status(cur, s.uuid, "API_FAIL", err=msg)
|
||||||
|
status_now = "API_FAIL"
|
||||||
|
|
||||||
|
# 8) Cleanup entrance only after verify (+ API when enabled)
|
||||||
|
api_ready_for_cleanup = (not cfg.enable_deermapper_api) or status_rank(status_now) >= status_rank("API_OK")
|
||||||
|
if status_rank(status_now) >= status_rank("THUMB_OK") and api_ready_for_cleanup:
|
||||||
try:
|
try:
|
||||||
s3.head(processed_key)
|
s3.head(processed_key)
|
||||||
s3.head(thumb_key)
|
s3.head(thumb_key)
|
||||||
@@ -687,21 +897,22 @@ def is_completed_job_materialized(cur, s3: S3, cfg: AppConfig, uuid: str) -> boo
|
|||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
def cleanup_entrance_set(s3: S3, s: EntranceSet):
|
def cleanup_entrance_set(s3: S3, s: EntranceSet) -> int:
|
||||||
to_delete = [s.jpg_key, s.meta_key]
|
to_delete = [s.jpg_key, s.meta_key]
|
||||||
if s.cat_key:
|
if s.cat_key:
|
||||||
to_delete.append(s.cat_key)
|
to_delete.append(s.cat_key)
|
||||||
s3.delete_keys(to_delete)
|
return s3.delete_keys(to_delete)
|
||||||
|
|
||||||
|
|
||||||
def run_once(cfg: AppConfig) -> int:
|
def _process_ready_batch(cfg: AppConfig, batch: list[EntranceSet]):
|
||||||
s3c = S3(cfg)
|
s3c = S3(cfg)
|
||||||
ready = collect_ready_sets(s3c, cfg)
|
|
||||||
if not ready:
|
|
||||||
return 0
|
|
||||||
|
|
||||||
with psycopg.connect(cfg.pg_dsn) as db:
|
with psycopg.connect(cfg.pg_dsn) as db:
|
||||||
for s in ready:
|
for s in batch:
|
||||||
|
start = time.perf_counter()
|
||||||
|
result = "failure"
|
||||||
|
final_status = "UNKNOWN"
|
||||||
|
err_text = None
|
||||||
|
cleanup_deleted = None
|
||||||
try:
|
try:
|
||||||
skip_duplicate = False
|
skip_duplicate = False
|
||||||
with db.transaction():
|
with db.transaction():
|
||||||
@@ -710,17 +921,56 @@ def run_once(cfg: AppConfig) -> int:
|
|||||||
skip_duplicate = True
|
skip_duplicate = True
|
||||||
else:
|
else:
|
||||||
process_one(cur, s3c, cfg, s)
|
process_one(cur, s3c, cfg, s)
|
||||||
|
final_status = (job_get(cur, s.uuid) or {}).get("status", "UNKNOWN")
|
||||||
|
result = "success" if is_terminal_success(final_status, cfg) else "failure"
|
||||||
if skip_duplicate:
|
if skip_duplicate:
|
||||||
cleanup_entrance_set(s3c, s)
|
cleanup_deleted = cleanup_entrance_set(s3c, s)
|
||||||
print(f"[SKIP] duplicate already processed: {s.uuid}")
|
final_status = "DUPLICATE_CLEANED"
|
||||||
|
result = "success"
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
with db.transaction():
|
with db.transaction():
|
||||||
with db.cursor() as cur:
|
with db.cursor() as cur:
|
||||||
job_upsert_new(cur, s.uuid, s.jpg_key, s.meta_key, s.cat_key)
|
job_upsert_new(cur, s.uuid, s.jpg_key, s.meta_key, s.cat_key)
|
||||||
job_set_status(cur, s.uuid, "ERROR", err=str(e))
|
job_set_status(cur, s.uuid, "ERROR", err=str(e))
|
||||||
print(f"[ERROR] {s.uuid}: {e}")
|
final_status = "ERROR"
|
||||||
|
err_text = str(e)
|
||||||
|
elapsed_ms = int((time.perf_counter() - start) * 1000)
|
||||||
|
if err_text:
|
||||||
|
print(
|
||||||
|
f"[RESULT] uuid={s.uuid} result={result} status={final_status} "
|
||||||
|
f"process_ms={elapsed_ms} error={err_text}"
|
||||||
|
)
|
||||||
|
elif cleanup_deleted is not None:
|
||||||
|
print(
|
||||||
|
f"[RESULT] uuid={s.uuid} result={result} status={final_status} "
|
||||||
|
f"process_ms={elapsed_ms} cleanup_deleted={cleanup_deleted}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
print(f"[RESULT] uuid={s.uuid} result={result} status={final_status} process_ms={elapsed_ms}")
|
||||||
if cfg.job_sleep_seconds > 0:
|
if cfg.job_sleep_seconds > 0:
|
||||||
time.sleep(cfg.job_sleep_seconds)
|
time.sleep(cfg.job_sleep_seconds)
|
||||||
|
|
||||||
|
|
||||||
|
def run_once(cfg: AppConfig) -> int:
|
||||||
|
s3c = S3(cfg)
|
||||||
|
ready = collect_ready_sets(s3c, cfg)
|
||||||
|
if not ready:
|
||||||
|
return 0
|
||||||
|
|
||||||
|
workers = max(1, min(cfg.parallel_workers, len(ready)))
|
||||||
|
if workers == 1:
|
||||||
|
_process_ready_batch(cfg, ready)
|
||||||
|
return len(ready)
|
||||||
|
|
||||||
|
batches: list[list[EntranceSet]] = [[] for _ in range(workers)]
|
||||||
|
for i, s in enumerate(ready):
|
||||||
|
batches[i % workers].append(s)
|
||||||
|
|
||||||
|
with ThreadPoolExecutor(max_workers=workers) as pool:
|
||||||
|
futures = [pool.submit(_process_ready_batch, cfg, batch) for batch in batches if batch]
|
||||||
|
for future in as_completed(futures):
|
||||||
|
future.result()
|
||||||
|
|
||||||
return len(ready)
|
return len(ready)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user