Enhance OCR timestamp extraction with flexible regex matching and improved error handling

This commit is contained in:
Dom 2026-02-07 20:07:02 +00:00
parent 1e4a7b86b6
commit 3873cb7b3f

73
main.py
View File

@ -38,6 +38,9 @@ CAT_RE = re.compile(r"^([0-9a-fA-F-]{36})_categories\.json$")
# Strict pattern for an OCR'd timestamp of the form "HH:MM:SS DD-MM-YYYY":
# at least one whitespace character must separate the time from the date,
# and '-' is the only accepted date separator. Named groups: h, m, s, d, mo, y.
OCR_DT_RE = re.compile(
r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})\s+(?P<d>\d{2})-(?P<mo>\d{2})-(?P<y>\d{4})"
)
# Lenient variant of OCR_DT_RE: the whitespace between time and date is
# optional (\s*), and each date separator may be either '-' or '.'.
# It exposes the same named groups as the strict pattern.
OCR_DT_RE_FLEX = re.compile(
r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})\s*(?P<d>\d{2})[-.](?P<mo>\d{2})[-.](?P<y>\d{4})"
)
@dataclass
@ -156,27 +159,57 @@ def ocr_extract_timestamp(jpg_bytes: bytes, crop_w_frac: float, crop_h_frac: flo
cfg = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789:- '
text = pytesseract.image_to_string(bw, config=cfg).strip()
m = OCR_DT_RE.search(text)
if not m:
# retry without thresholding
text2 = pytesseract.image_to_string(gray, config=cfg).strip()
m = OCR_DT_RE.search(text2)
if not m:
raise ValueError(f"OCR timestamp not found. OCR1='{text}' OCR2='{text2}'")
dt_local_naive = datetime(
int(m.group("y")),
int(m.group("mo")),
int(m.group("d")),
int(m.group("h")),
int(m.group("m")),
int(m.group("s")),
)
dt_local_naive = _parse_ocr_datetime(text, text2)
# Keep camera-local timezone (no UTC conversion requested)
return dt_local_naive.replace(tzinfo=customer_tzinfo)
def _parse_ocr_datetime(*texts: str) -> datetime:
    """Extract a naive datetime from one or more OCR text candidates.

    The candidates are tried with three strategies, each applied to every
    text in the order given before moving on to the next strategy:

    1. Full "time + date" matches of ``OCR_DT_RE`` and then
       ``OCR_DT_RE_FLEX``.
    2. A time (``HH:MM:SS``) and a date (``DD-MM-YYYY`` / ``DD.MM.YYYY``)
       located independently anywhere in the same text.
    3. A digits-only scan: every non-digit is removed and each 14-digit
       window is read as ``HHMMSSDDMMYYYY``.

    Within strategies 1 and 2 every match is examined, not only the first
    one, so a misread fragment earlier in the text cannot hide a valid
    timestamp that follows it.

    Args:
        *texts: OCR output strings to search.

    Returns:
        The first candidate that forms a valid calendar date and time, as a
        naive ``datetime`` (no tzinfo attached).

    Raises:
        ValueError: If no strategy yields a valid datetime. The message
            includes the first two texts for diagnostics.
    """

    def try_build(h, m, s, d, mo, y):
        """Return the datetime for the given fields, or None if they are invalid."""
        try:
            return datetime(int(y), int(mo), int(d), int(h), int(m), int(s))
        except ValueError:
            # Out-of-range field (e.g. hour 99, month 13, day 31 in February).
            return None

    # 1) Direct regex matches; strict pattern first, then the lenient one.
    for text in texts:
        for rx in (OCR_DT_RE, OCR_DT_RE_FLEX):
            for match in rx.finditer(text):
                parsed = try_build(*match.group("h", "m", "s", "d", "mo", "y"))
                if parsed is not None:
                    return parsed

    # 2) Find time and date separately (handles junk or missing separators
    #    between the two parts). The first time is paired with the first
    #    date before any other combination is tried.
    time_rx = re.compile(r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})")
    date_rx = re.compile(r"(?P<d>\d{2})[-.](?P<mo>\d{2})[-.](?P<y>\d{4})")
    for text in texts:
        date_matches = list(date_rx.finditer(text))
        for time_match in time_rx.finditer(text):
            for date_match in date_matches:
                parsed = try_build(
                    *time_match.group("h", "m", "s"),
                    *date_match.group("d", "mo", "y"),
                )
                if parsed is not None:
                    return parsed

    # 3) Digits-only fallback: slide a 14-character window over the digits
    #    and interpret it as HHMMSSDDMMYYYY.
    for text in texts:
        digits = re.sub(r"\D", "", text)
        # range() is empty when fewer than 14 digits are available.
        for start in range(len(digits) - 13):
            chunk = digits[start:start + 14]
            parsed = try_build(
                chunk[0:2], chunk[2:4], chunk[4:6],
                chunk[6:8], chunk[8:10], chunk[10:14],
            )
            if parsed is not None:
                return parsed

    ocr1 = texts[0] if texts else ""
    ocr2 = texts[1] if len(texts) > 1 else ""
    raise ValueError(f"OCR timestamp not found. OCR1='{ocr1}' OCR2='{ocr2}'")
def exif_write_with_exiftool(jpg_in: bytes, dt_original_local: datetime, dt_digitized_utc: datetime) -> bytes:
"""
DateTimeOriginal = OCR time (camera local) + OffsetTimeOriginal
@ -280,11 +313,11 @@ def job_set_status(cur, uuid: str, status: str, *, err: Optional[str] = None, pr
UPDATE remote_cam.import_job
SET status = %s,
attempts = attempts + CASE WHEN %s = 'ERROR' THEN 1 ELSE 0 END,
last_error = CASE WHEN %s IS NULL THEN last_error ELSE %s END,
last_error = COALESCE(%s::text, last_error),
processed_jpg_key = COALESCE(%s, processed_jpg_key),
thumbnail_key = COALESCE(%s, thumbnail_key)
WHERE image_uuid = %s;
""", (status, status, err, err, processed_key, thumb_key, uuid))
""", (status, status, err, processed_key, thumb_key, uuid))
def insert_resource(cur, uuid: str, typ: str):
@ -314,7 +347,13 @@ def insert_metadata(cur, uuid: str, meta: dict):
lat_f = float(lat) if lat not in (None, "") else None
lon_f = float(lon) if lon not in (None, "") else None
# NOTE: DateTimeLastSettings looks like "05:45:35 30.06.2025" in sample; parse if needed later.
dls_raw = get("DateTimeLastSettings")
dls = None
if dls_raw:
try:
dls = datetime.strptime(dls_raw, "%H:%M:%S %d.%m.%Y").replace(tzinfo=timezone.utc)
except Exception:
dls = None
cur.execute("""
INSERT INTO remote_cam.metadata(
metadata_uuid, import_ts, "ImageCreationDate", "SizeInByte", "ImageFormat", "ImageNight",
@ -366,7 +405,7 @@ def insert_metadata(cur, uuid: str, meta: dict):
servertime = EXCLUDED.servertime;
""", (
uuid, icd, get("SizeInByte"), get("ImageFormat"), get("ImageNight"),
get("DateTimeLastSettings"), get("BatteryStatus"), get("FirmwareVersion"),
dls, get("BatteryStatus"), get("FirmwareVersion"),
get("SignalStrength"), get("Temperature"),
get("CoordinateSwitch"), lat_f, lon_f, get("WorkPeriod"),
get("WorkStart"), get("WorkEnd"), get("ThumbnailSize"),