Enhance OCR timestamp extraction with flexible regex matching and improved error handling
This commit is contained in:
parent
1e4a7b86b6
commit
3873cb7b3f
77
main.py
77
main.py
@ -38,6 +38,9 @@ CAT_RE = re.compile(r"^([0-9a-fA-F-]{36})_categories\.json$")
|
|||||||
OCR_DT_RE = re.compile(
|
OCR_DT_RE = re.compile(
|
||||||
r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})\s+(?P<d>\d{2})-(?P<mo>\d{2})-(?P<y>\d{4})"
|
r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})\s+(?P<d>\d{2})-(?P<mo>\d{2})-(?P<y>\d{4})"
|
||||||
)
|
)
|
||||||
|
OCR_DT_RE_FLEX = re.compile(
|
||||||
|
r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})\s*(?P<d>\d{2})[-.](?P<mo>\d{2})[-.](?P<y>\d{4})"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
@ -156,27 +159,57 @@ def ocr_extract_timestamp(jpg_bytes: bytes, crop_w_frac: float, crop_h_frac: flo
|
|||||||
cfg = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789:- '
|
cfg = r'--oem 3 --psm 6 -c tessedit_char_whitelist=0123456789:- '
|
||||||
text = pytesseract.image_to_string(bw, config=cfg).strip()
|
text = pytesseract.image_to_string(bw, config=cfg).strip()
|
||||||
|
|
||||||
m = OCR_DT_RE.search(text)
|
# retry without thresholding
|
||||||
if not m:
|
text2 = pytesseract.image_to_string(gray, config=cfg).strip()
|
||||||
# retry without thresholding
|
|
||||||
text2 = pytesseract.image_to_string(gray, config=cfg).strip()
|
|
||||||
m = OCR_DT_RE.search(text2)
|
|
||||||
if not m:
|
|
||||||
raise ValueError(f"OCR timestamp not found. OCR1='{text}' OCR2='{text2}'")
|
|
||||||
|
|
||||||
dt_local_naive = datetime(
|
dt_local_naive = _parse_ocr_datetime(text, text2)
|
||||||
int(m.group("y")),
|
|
||||||
int(m.group("mo")),
|
|
||||||
int(m.group("d")),
|
|
||||||
int(m.group("h")),
|
|
||||||
int(m.group("m")),
|
|
||||||
int(m.group("s")),
|
|
||||||
)
|
|
||||||
|
|
||||||
# Keep camera-local timezone (no UTC conversion requested)
|
# Keep camera-local timezone (no UTC conversion requested)
|
||||||
return dt_local_naive.replace(tzinfo=customer_tzinfo)
|
return dt_local_naive.replace(tzinfo=customer_tzinfo)
|
||||||
|
|
||||||
|
|
||||||
|
# Standalone time / date patterns used when the combined OCR_DT_RE* patterns
# do not match (e.g. stray characters between the time and the date part).
_OCR_TIME_RE = re.compile(r"(?P<h>\d{2}):(?P<m>\d{2}):(?P<s>\d{2})")
_OCR_DATE_RE = re.compile(r"(?P<d>\d{2})[-.](?P<mo>\d{2})[-.](?P<y>\d{4})")


def _parse_ocr_datetime(*texts: str) -> datetime:
    """Extract a naive datetime from one or more OCR result strings.

    Three strategies are tried in order; within each strategy the texts are
    tried in the order given, and the first candidate that forms a valid
    calendar date/time wins:

    1. ``OCR_DT_RE`` then ``OCR_DT_RE_FLEX`` (time immediately followed by
       the date).
    2. Time and date located independently in the same text.
    3. Digits-only fallback: every non-digit is dropped and the remaining
       digits are scanned for a 14-digit ``HHMMSSDDMMYYYY`` window.

    Args:
        *texts: OCR output strings, most trusted first.

    Returns:
        A timezone-naive ``datetime`` built from the first valid candidate.

    Raises:
        ValueError: If no strategy yields a valid datetime. The message
            includes the first two OCR texts for diagnostics.
    """

    def try_build(h, m, s, d, mo, y):
        """Return the datetime for the given digit strings, or None if invalid."""
        try:
            return datetime(int(y), int(mo), int(d), int(h), int(m), int(s))
        except ValueError:
            # int() on a non-number or an out-of-range field (month 13,
            # hour 25, year 0, ...) means this candidate is not a timestamp.
            return None

    # 1) direct regex matches
    for t in texts:
        for rx in (OCR_DT_RE, OCR_DT_RE_FLEX):
            match = rx.search(t)
            if match is None:
                continue
            dt = try_build(*match.group("h", "m", "s", "d", "mo", "y"))
            if dt is not None:
                return dt

    # 2) find time and date separately within the same text
    for t in texts:
        tm = _OCR_TIME_RE.search(t)
        dm = _OCR_DATE_RE.search(t)
        if tm is None or dm is None:
            continue
        dt = try_build(*tm.group("h", "m", "s"), *dm.group("d", "mo", "y"))
        if dt is not None:
            return dt

    # 3) digits-only fallback: scan for HHMMSSDDMMYYYY
    for t in texts:
        digits = re.sub(r"\D", "", t)
        # range() is empty when fewer than 14 digits are available, so every
        # window taken below is exactly 14 characters long.
        for i in range(len(digits) - 13):
            chunk = digits[i:i + 14]
            dt = try_build(
                chunk[0:2], chunk[2:4], chunk[4:6],
                chunk[6:8], chunk[8:10], chunk[10:14],
            )
            if dt is not None:
                return dt

    raise ValueError(f"OCR timestamp not found. OCR1='{texts[0] if texts else ''}' OCR2='{texts[1] if len(texts) > 1 else ''}'")
|
||||||
|
|
||||||
|
|
||||||
def exif_write_with_exiftool(jpg_in: bytes, dt_original_local: datetime, dt_digitized_utc: datetime) -> bytes:
|
def exif_write_with_exiftool(jpg_in: bytes, dt_original_local: datetime, dt_digitized_utc: datetime) -> bytes:
|
||||||
"""
|
"""
|
||||||
DateTimeOriginal = OCR time (camera local) + OffsetTimeOriginal
|
DateTimeOriginal = OCR time (camera local) + OffsetTimeOriginal
|
||||||
@ -280,11 +313,11 @@ def job_set_status(cur, uuid: str, status: str, *, err: Optional[str] = None, pr
|
|||||||
UPDATE remote_cam.import_job
|
UPDATE remote_cam.import_job
|
||||||
SET status = %s,
|
SET status = %s,
|
||||||
attempts = attempts + CASE WHEN %s = 'ERROR' THEN 1 ELSE 0 END,
|
attempts = attempts + CASE WHEN %s = 'ERROR' THEN 1 ELSE 0 END,
|
||||||
last_error = CASE WHEN %s IS NULL THEN last_error ELSE %s END,
|
last_error = COALESCE(%s::text, last_error),
|
||||||
processed_jpg_key = COALESCE(%s, processed_jpg_key),
|
processed_jpg_key = COALESCE(%s, processed_jpg_key),
|
||||||
thumbnail_key = COALESCE(%s, thumbnail_key)
|
thumbnail_key = COALESCE(%s, thumbnail_key)
|
||||||
WHERE image_uuid = %s;
|
WHERE image_uuid = %s;
|
||||||
""", (status, status, err, err, processed_key, thumb_key, uuid))
|
""", (status, status, err, processed_key, thumb_key, uuid))
|
||||||
|
|
||||||
|
|
||||||
def insert_resource(cur, uuid: str, typ: str):
|
def insert_resource(cur, uuid: str, typ: str):
|
||||||
@ -314,7 +347,13 @@ def insert_metadata(cur, uuid: str, meta: dict):
|
|||||||
lat_f = float(lat) if lat not in (None, "") else None
|
lat_f = float(lat) if lat not in (None, "") else None
|
||||||
lon_f = float(lon) if lon not in (None, "") else None
|
lon_f = float(lon) if lon not in (None, "") else None
|
||||||
|
|
||||||
# NOTE: DateTimeLastSettings looks like "05:45:35 30.06.2025" in sample; parse if needed later.
|
dls_raw = get("DateTimeLastSettings")
|
||||||
|
dls = None
|
||||||
|
if dls_raw:
|
||||||
|
try:
|
||||||
|
dls = datetime.strptime(dls_raw, "%H:%M:%S %d.%m.%Y").replace(tzinfo=timezone.utc)
|
||||||
|
except Exception:
|
||||||
|
dls = None
|
||||||
cur.execute("""
|
cur.execute("""
|
||||||
INSERT INTO remote_cam.metadata(
|
INSERT INTO remote_cam.metadata(
|
||||||
metadata_uuid, import_ts, "ImageCreationDate", "SizeInByte", "ImageFormat", "ImageNight",
|
metadata_uuid, import_ts, "ImageCreationDate", "SizeInByte", "ImageFormat", "ImageNight",
|
||||||
@ -366,7 +405,7 @@ def insert_metadata(cur, uuid: str, meta: dict):
|
|||||||
servertime = EXCLUDED.servertime;
|
servertime = EXCLUDED.servertime;
|
||||||
""", (
|
""", (
|
||||||
uuid, icd, get("SizeInByte"), get("ImageFormat"), get("ImageNight"),
|
uuid, icd, get("SizeInByte"), get("ImageFormat"), get("ImageNight"),
|
||||||
get("DateTimeLastSettings"), get("BatteryStatus"), get("FirmwareVersion"),
|
dls, get("BatteryStatus"), get("FirmwareVersion"),
|
||||||
get("SignalStrength"), get("Temperature"),
|
get("SignalStrength"), get("Temperature"),
|
||||||
get("CoordinateSwitch"), lat_f, lon_f, get("WorkPeriod"),
|
get("CoordinateSwitch"), lat_f, lon_f, get("WorkPeriod"),
|
||||||
get("WorkStart"), get("WorkEnd"), get("ThumbnailSize"),
|
get("WorkStart"), get("WorkEnd"), get("ThumbnailSize"),
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user