Enhance OCR timestamp extraction with flexible regex matching and improved error handling
This commit is contained in:
parent
1e4a7b86b6
commit
3873cb7b3f
73
main.py
73
main.py
@ -38,6 +38,9 @@ CAT_RE = re.compile(r"^([0-9a-fA-F-]{36})_categories\.json$")
|
||||
# Strict form of the OCR timestamp: "HH:MM:SS DD-MM-YYYY" with at least one
# whitespace character between the time and the date, and dashes in the date.
OCR_DT_RE = re.compile(
    r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})"
    r"\s+"
    r"(?P<d>\d{2})-(?P<mo>\d{2})-(?P<y>\d{4})"
)

# Tolerant form: the whitespace between time and date is optional, and each
# date separator may be either "-" or ".".
OCR_DT_RE_FLEX = re.compile(
    r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})"
    r"\s*"
    r"(?P<d>\d{2})[-.](?P<mo>\d{2})[-.](?P<y>\d{4})"
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -156,27 +159,57 @@ def ocr_extract_timestamp(jpg_bytes: bytes, crop_w_frac: float, crop_h_frac: flo
|
||||
cfg = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789:- '
|
||||
text = pytesseract.image_to_string(bw, config=cfg).strip()
|
||||
|
||||
m = OCR_DT_RE.search(text)
|
||||
if not m:
|
||||
# retry without thresholding
|
||||
text2 = pytesseract.image_to_string(gray, config=cfg).strip()
|
||||
m = OCR_DT_RE.search(text2)
|
||||
if not m:
|
||||
raise ValueError(f"OCR timestamp not found. OCR1='{text}' OCR2='{text2}'")
|
||||
|
||||
dt_local_naive = datetime(
|
||||
int(m.group("y")),
|
||||
int(m.group("mo")),
|
||||
int(m.group("d")),
|
||||
int(m.group("h")),
|
||||
int(m.group("m")),
|
||||
int(m.group("s")),
|
||||
)
|
||||
dt_local_naive = _parse_ocr_datetime(text, text2)
|
||||
|
||||
# Keep camera-local timezone (no UTC conversion requested)
|
||||
return dt_local_naive.replace(tzinfo=customer_tzinfo)
|
||||
|
||||
|
||||
# Time and date located independently of each other; used when OCR noise
# sits between them so that neither full-timestamp pattern matches.
_OCR_TIME_RE = re.compile(r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})")
_OCR_DATE_RE = re.compile(r"(?P<d>\d{2})[-.](?P<mo>\d{2})[-.](?P<y>\d{4})")


def _build_ocr_datetime(h, m, s, d, mo, y) -> Optional[datetime]:
    """Return a naive datetime built from the given fields, or None if they are not a valid date/time."""
    try:
        return datetime(int(y), int(mo), int(d), int(h), int(m), int(s))
    except ValueError:
        # int() on a non-numeric field and datetime() on an out-of-range
        # field both raise ValueError; either means "not a timestamp".
        return None


def _parse_ocr_datetime(*texts: str) -> datetime:
    """
    Extract the first plausible "HH:MM:SS DD-MM-YYYY" timestamp from OCR output.

    The texts are examined in three stages, each stage going through all
    texts in the order given before the next stage starts:

    1. Full-timestamp matches of OCR_DT_RE, then OCR_DT_RE_FLEX.
    2. A time match and a date match found separately in the same text.
    3. A digits-only scan for 14 consecutive digits laid out as HHMMSSDDMMYYYY.

    Within every stage all candidates are tried in order of appearance, so a
    leading candidate that is not a real date/time (e.g. hour 25) does not
    hide a valid timestamp that appears later in the same text.

    Args:
        *texts: OCR result strings to search, in priority order.

    Returns:
        A naive datetime (no tzinfo) for the first candidate that forms a
        valid calendar date and time.

    Raises:
        ValueError: if no stage yields a valid timestamp. The message quotes
            the first two texts.
    """
    # 1) full-timestamp regex matches (strict pattern first, then the tolerant one)
    for text in texts:
        for rx in (OCR_DT_RE, OCR_DT_RE_FLEX):
            for match in rx.finditer(text):
                dt = _build_ocr_datetime(*match.group("h", "m", "s", "d", "mo", "y"))
                if dt is not None:
                    return dt

    # 2) time and date found separately (handles junk between time and date)
    for text in texts:
        date_matches = list(_OCR_DATE_RE.finditer(text))
        for time_match in _OCR_TIME_RE.finditer(text):
            for date_match in date_matches:
                dt = _build_ocr_datetime(
                    *time_match.group("h", "m", "s"),
                    *date_match.group("d", "mo", "y"),
                )
                if dt is not None:
                    return dt

    # 3) digits-only fallback: slide a 14-digit window laid out as HHMMSSDDMMYYYY
    for text in texts:
        digits = re.sub(r"\D", "", text)
        # range() of a non-positive number is empty, so short inputs are skipped.
        for start in range(len(digits) - 13):
            chunk = digits[start:start + 14]
            dt = _build_ocr_datetime(
                chunk[0:2], chunk[2:4], chunk[4:6],
                chunk[6:8], chunk[8:10], chunk[10:14],
            )
            if dt is not None:
                return dt

    ocr1 = texts[0] if texts else ""
    ocr2 = texts[1] if len(texts) > 1 else ""
    raise ValueError(f"OCR timestamp not found. OCR1='{ocr1}' OCR2='{ocr2}'")
|
||||
|
||||
|
||||
def exif_write_with_exiftool(jpg_in: bytes, dt_original_local: datetime, dt_digitized_utc: datetime) -> bytes:
|
||||
"""
|
||||
DateTimeOriginal = OCR time (camera local) + OffsetTimeOriginal
|
||||
@ -280,11 +313,11 @@ def job_set_status(cur, uuid: str, status: str, *, err: Optional[str] = None, pr
|
||||
UPDATE remote_cam.import_job
|
||||
SET status = %s,
|
||||
attempts = attempts + CASE WHEN %s = 'ERROR' THEN 1 ELSE 0 END,
|
||||
last_error = CASE WHEN %s IS NULL THEN last_error ELSE %s END,
|
||||
last_error = COALESCE(%s::text, last_error),
|
||||
processed_jpg_key = COALESCE(%s, processed_jpg_key),
|
||||
thumbnail_key = COALESCE(%s, thumbnail_key)
|
||||
WHERE image_uuid = %s;
|
||||
""", (status, status, err, err, processed_key, thumb_key, uuid))
|
||||
""", (status, status, err, processed_key, thumb_key, uuid))
|
||||
|
||||
|
||||
def insert_resource(cur, uuid: str, typ: str):
|
||||
@ -314,7 +347,13 @@ def insert_metadata(cur, uuid: str, meta: dict):
|
||||
lat_f = float(lat) if lat not in (None, "") else None
|
||||
lon_f = float(lon) if lon not in (None, "") else None
|
||||
|
||||
# NOTE: DateTimeLastSettings looks like "05:45:35 30.06.2025" in sample; parse if needed later.
|
||||
dls_raw = get("DateTimeLastSettings")
|
||||
dls = None
|
||||
if dls_raw:
|
||||
try:
|
||||
dls = datetime.strptime(dls_raw, "%H:%M:%S %d.%m.%Y").replace(tzinfo=timezone.utc)
|
||||
except Exception:
|
||||
dls = None
|
||||
cur.execute("""
|
||||
INSERT INTO remote_cam.metadata(
|
||||
metadata_uuid, import_ts, "ImageCreationDate", "SizeInByte", "ImageFormat", "ImageNight",
|
||||
@ -366,7 +405,7 @@ def insert_metadata(cur, uuid: str, meta: dict):
|
||||
servertime = EXCLUDED.servertime;
|
||||
""", (
|
||||
uuid, icd, get("SizeInByte"), get("ImageFormat"), get("ImageNight"),
|
||||
get("DateTimeLastSettings"), get("BatteryStatus"), get("FirmwareVersion"),
|
||||
dls, get("BatteryStatus"), get("FirmwareVersion"),
|
||||
get("SignalStrength"), get("Temperature"),
|
||||
get("CoordinateSwitch"), lat_f, lon_f, get("WorkPeriod"),
|
||||
get("WorkStart"), get("WorkEnd"), get("ThumbnailSize"),
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user