diff --git a/data_handler/Containerfile b/data_handler/Containerfile index f9a3f6c..5cdf7b9 100644 --- a/data_handler/Containerfile +++ b/data_handler/Containerfile @@ -1,5 +1,5 @@ FROM ghcr.io/asohh/fedora-minimal-containers/php-cli:43 -RUN microdnf install -y php-mysqli php-bcmath php-xml gunzip https://raw.githubusercontent.com/rpmsphere/x86_64/master/d/dcron-4.5-7.1.x86_64.rpm && microdnf clean all +RUN microdnf install -y php-mysqli php-bcmath php-xml python-lxml gunzip https://raw.githubusercontent.com/rpmsphere/x86_64/master/d/dcron-4.5-7.1.x86_64.rpm && microdnf clean all COPY ./utils/* /opt/ COPY ./data_init/*.sql /opt/init/init.sql diff --git a/data_handler/utils/create_camera_update_statements_download.sh b/data_handler/utils/create_camera_update_statements_download.sh index 17ccdbb..b55ad3b 100644 --- a/data_handler/utils/create_camera_update_statements_download.sh +++ b/data_handler/utils/create_camera_update_statements_download.sh @@ -22,11 +22,11 @@ mkdir -p logs seq "$((lastSeqNum + 1))" "$newSeqNum" > seq_list.txt -cat seq_list.txt | xargs -n1 -P7 -I{} bash -c ' +cat seq_list.txt | xargs -n1 -P10 -I{} bash -c ' seqnum="$1" f=$(printf "%09d" "$seqnum") path=$(echo "$f" | sed -E "s#(...)(...)(...)#\1/\2/\3#") url="${REPLICATE_URL}/${path}.osc.gz" echo "Downloading $url" - curl -sL "$url" -o "${seqnum}.osc.gz" && gunzip "${seqnum}.osc.gz" + curl -sL "$url" -o "${seqnum}.osc.gz" && gunzip "${seqnum}.osc.gz" && python filter_osc.py ${seqnum} && rm ${seqnum}.osc ' _ {} diff --git a/data_handler/utils/create_camera_update_statements_process.sh b/data_handler/utils/create_camera_update_statements_process.sh index 87267e9..4744dff 100644 --- a/data_handler/utils/create_camera_update_statements_process.sh +++ b/data_handler/utils/create_camera_update_statements_process.sh @@ -50,21 +50,21 @@ curSeqNum=$lastSeqNum curSeqNum=$(( $curSeqNum + 1 )) while [ $curSeqNum -lt $newSeqNum ] do - if [ -e "$curSeqNum.osc" ] + if [ -e "${curSeqNum}_filtered.osc" ] then rm "change_file.osc" - cp $curSeqNum.osc change_file.osc + cp ${curSeqNum}_filtered.osc change_file.osc php create_camera_update_statements.php $curSeqNum - rm $curSeqNum.osc + rm ${curSeqNum}_filtered.osc curSeqNum=$(( $curSeqNum + 1 )) else path=$(echo "$(printf "%09d" "$curSeqNum")" | sed -E "s#(...)(...)(...)#\1/\2/\3#") url="${REPLICATE_URL}/${path}.osc.gz" echo $url - curl -sL "$url" -o "$curSeqNum.osc.gz" && gunzip "$curSeqNum.osc.gz" + curl -sL "$url" -o "$curSeqNum.osc.gz" && gunzip "$curSeqNum.osc.gz" && python filter_osc.py ${curSeqNum} && rm ${curSeqNum}.osc fi done diff --git a/data_handler/utils/filter_osc.py b/data_handler/utils/filter_osc.py new file mode 100644 index 0000000..0baea64 --- /dev/null +++ b/data_handler/utils/filter_osc.py @@ -0,0 +1,90 @@ +import sys +import gzip +from lxml import etree + +def open_maybe_gzip(filename, mode="rb"): + """Open file normally or as gzip based on extension.""" + if filename.endswith(".gz"): + return gzip.open(filename, mode) + return open(filename, mode) + +def process_file(number): + input_file = f"{number}.osc" + filtered_file = f"{number}_filtered.osc" + deleted_file = f"/migrations/{number}_deleted.osc" + + # --- open input and outputs --- + with open_maybe_gzip(input_file, "rb") as infile, \ + open(filtered_file, "wb") as out_filt, \ + open(deleted_file, "wb") as out_del: + + # XML headers + out_filt.write(b"\n") + out_filt.write(b"\n") + + out_del.write(b"\n") + out_del.write(b"\n") + + context = etree.iterparse(infile, events=("start", "end")) + current_section = None + buffer_filt = None + buffer_del = None + + for event, elem in context: + if event == "start" and elem.tag in ("create", "modify", "delete"): + current_section = elem.tag + if current_section == "delete": + buffer_del = etree.Element("delete") + else: + buffer_filt = etree.Element(current_section) + + elif event == "end" and elem.tag == "node" and current_section: + node_copy = etree.fromstring(etree.tostring(elem)) + + if current_section in ("create", "modify"): + tags = {t.get("k"): t.get("v") for t in elem.findall("tag")} + if tags.get("surveillance:type") == "camera": + buffer_filt.append(node_copy) + + elif current_section == "delete": + buffer_del.append(node_copy) + + elem.clear() + + elif event == "end" and elem.tag in ("create", "modify"): + if buffer_filt is not None and len(buffer_filt): + out_filt.write(f" <{current_section}>\n".encode()) + for node in buffer_filt: + out_filt.write(etree.tostring(node, encoding="utf-8")) + out_filt.write(b"\n") + out_filt.write(f" \n".encode()) + buffer_filt = None + current_section = None + elem.clear() + + elif event == "end" and elem.tag == "delete": + if buffer_del is not None and len(buffer_del): + out_del.write(b" \n") + for node in buffer_del: + out_del.write(etree.tostring(node, encoding="utf-8")) + out_del.write(b"\n") + out_del.write(b" \n") + buffer_del = None + current_section = None + elem.clear() + + out_filt.write(b"\n") + out_del.write(b"\n") + + print(f"✅ Processed {input_file}") + print(f" → {filtered_file} (camera nodes)") + print(f" → {deleted_file} (delete nodes)") + +if __name__ == "__main__": + if len(sys.argv) != 2: + print("Usage: python filter_replication.py ") + print("Example: python filter_replication.py 123") + sys.exit(1) + + number = sys.argv[1] + process_file(number) diff --git a/docker-compose.yml b/docker-compose.yml index 1d60cac..f64f158 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -8,6 +8,8 @@ services: MYSQL_PASSWORD: camerapassword # ${{secrets.MYSQL_PASSWORD}} ports: - "3306:3306" + volumes: + - ./mariadb:/var/lib/mysql:Z healthcheck: test: ["CMD", "mariadb-admin", "ping", "-h", "localhost", "-uroot", "-prootpassword"] interval: 10s @@ -17,6 +19,7 @@ services: web: image: git.hamburg.ccc.de/ccchh/sunders/web:latest + build: ./web/ environment: MYSQL_HOST: db MYSQL_DB: camera # ${{secrets.MYSQL_DATABASE}} @@ -34,6 +37,7 @@ services: data_handler: image: git.hamburg.ccc.de/ccchh/sunders/data_handler:latest + build: ./data_handler/ environment: MYSQL_HOST: db MYSQL_DB: camera # ${{secrets.MYSQL_DATABASE}}