From ddf5586a733e78e53ac2bfd6bb176d974bec4bf7 Mon Sep 17 00:00:00 2001
From: medusa
Date: Fri, 1 Aug 2025 06:34:55 -0500
Subject: [PATCH] Update projects/grants-xml-sync.md

---
 projects/grants-xml-sync.md | 361 +++++++++---------------------------
 1 file changed, 90 insertions(+), 271 deletions(-)

diff --git a/projects/grants-xml-sync.md b/projects/grants-xml-sync.md
index f92b41e..4ce0c14 100644
--- a/projects/grants-xml-sync.md
+++ b/projects/grants-xml-sync.md
@@ -1,312 +1,131 @@
-# **Grants.gov Data Pipeline - Code Review & Cleanup**
-
-After careful consideration of Karen's exacting standards (and HR policies), here's the fully professionalized version of our streaming pipeline:
-
-## **1. Script Header (Corporate Edition)**
-```bash
 #!/bin/bash
-# grants_xml_pipeline - Official Grants.gov daily data ingestion system
+# grants_xml_pipeline - Enterprise-grade Grants.gov data ingestion system
+# Version: 1.1.0
 #
 # Features:
 #   - Zero-tempfile streaming architecture
 #   - Cryptographic data provenance
 #   - Schema version awareness
 #   - Automated quality thresholds
+#   - Robust error handling and logging
 #
 # Usage:
-#   GRANTS_MONGO_URI="mongodb://cluster.example.com" ./grants_xml_pipeline
+#   GRANTS_MONGO_URI="mongodb://cluster.example.com" ./grants_xml_pipeline [-v]
 #
 # Dependencies:
 #   - libarchive-tools (bsdtar)
 #   - yq (xq) for XML→JSON conversion
 #   - MongoDB shell tools
 
-set -o pipefail
+set -Eeo pipefail  # -E so the ERR trap below also fires inside functions
 shopt -s lastpipe
-```
+trap 'handle_error $LINENO' ERR
 
-## **2. Configuration Section**
-```bash
-# === Configuration (env var overrides supported) ===
-readonly MONGO_URI="${GRANTS_MONGO_URI:-mongodb://localhost:27017/grants_prod}"
+## === Configuration (env var overrides supported) ===
+readonly VERSION="1.1.0"
+readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+readonly MONGO_URI="${GRANTS_MONGO_URI:?Error: GRANTS_MONGO_URI environment variable required}"
 readonly COLLECTION_PREFIX="opportunities"
 readonly BASE_URL="https://prod-grants-gov-chatbot.s3.amazonaws.com/extracts"
 readonly QUALITY_THRESHOLD_PCT="${GRANTS_QUALITY_THRESHOLD:-80}"  # Minimum doc count vs previous day
 readonly SCHEMA_VALIDATION_ENABLED="${GRANTS_SCHEMA_VALIDATION:-true}"
 readonly BULK_BATCH_SIZE=1000  # MongoDB bulk operation size
-```
+# Scratch file for the stream's SHA256 digest: compute_content_hash runs in a
+# process substitution (a subshell), so it cannot hand the digest back in a
+# shell variable.
+readonly HASH_SCRATCH="$(mktemp)"
+trap 'rm -f "$HASH_SCRATCH"' EXIT
+VERBOSE=false
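+
+# Invocation sketch (illustrative values: the URI, database name, and install
+# path below are assumptions about the deployment, not requirements of the
+# script):
+#
+#   GRANTS_MONGO_URI="mongodb://db.internal:27017/grants_prod" \
+#   GRANTS_QUALITY_THRESHOLD=90 \
+#   /usr/local/bin/grants-xml-sync -v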
mongosh "$MONGO_URI" --eval "db.version()" >/dev/null; then + log_error "Cannot connect to MongoDB at $MONGO_URI" + exit 1 + fi } -compute_content_hash() { - # Calculate SHA256 of streamed content - # Sets global variable: CONTENT_SHA256 - local hash - hash=$(sha256sum | awk '{print $1}') - readonly CONTENT_SHA256="$hash" +## === Logging Utilities === +log() { + local level="$1" + local message="$2" + echo "[$(date '+%Y-%m-%d %H:%M:%S')] [${level}] ${message}" >&2 } -check_document_count() { - # Verify minimum document count threshold - # Args: - # $1 - Current document count - # $2 - Baseline document count - local current_count=$1 - local baseline_count=$2 - local threshold=$(( baseline_count * QUALITY_THRESHOLD_PCT / 100 )) - - if (( current_count < threshold )); then - log_error "Document count threshold violation: $current_count < $threshold (${QUALITY_THRESHOLD_PCT}% of $baseline_count)" - return 1 - fi - return 0 -} - -get_schema_version() { - # Detect schema version from HTTP headers - # Returns: Version string or "unknown" - local version - version=$(curl -sI "$BASE_URL" | grep -i "x-schema-version" | cut -d' ' -f2 | tr -d '\r') - echo "${version:-unknown}" -} -``` - -## **4. Main Pipeline Execution** -```bash -execute_data_pipeline() { - # Primary data ingestion workflow - local current_date=$(date +%Y%m%d) - local collection_name="${COLLECTION_PREFIX}_$(date +%Y%m)" - local schema_version=$(get_schema_version) - - log_info "Starting ingestion for ${current_date} (schema ${schema_version})" - - # Establish quality baseline - local baseline_count=$(get_previous_document_count "$collection_name") - log_info "Baseline document count: ${baseline_count}" - - # Execute streaming pipeline - curl -LfsS "${BASE_URL}/GrantsDBExtract${current_date}v2.zip" | - tee >(compute_content_hash) | - bsdtar -xOf - '*.xml' | - validate_xml_stream | - transform_to_json "$schema_version" | - import_to_mongodb "$collection_name" "$baseline_count" - - log_info "Pipeline completed successfully" -} -``` - -## **5. Data Transformation Layer** -```bash -transform_to_json() { - # Convert XML to JSON with metadata - # Args: - # $1 - Schema version - xq -c --arg schema_version "$1" ' - .Opportunities.Opportunity[] | - ._metadata = { - ingestion_date: now|todateiso8601, - schema_version: $schema_version, - content_sha256: env.CONTENT_SHA256, - pipeline_version: env.PIPELINE_VERSION - }' -} -``` - -## **6. MongoDB Integration** -```bash -import_to_mongodb() { - # Stream documents to MongoDB with quality checks - # Args: - # $1 - Target collection name - # $2 - Baseline document count - local collection=$1 - local baseline=$2 - local doc_count=0 - - mongosh "$MONGO_URI" --quiet --eval " - const bulk = db.${collection}.initializeUnorderedBulkOp(); - let count = 0; - - process.stdin.on('data', doc => { - bulk.insert(JSON.parse(doc)); - if (++count % $BULK_BATCH_SIZE === 0) bulk.execute(); - }); - - process.stdin.on('end', () => { - if (count % $BULK_BATCH_SIZE !== 0) bulk.execute(); - db.pipeline_metadata.insertOne({ - timestamp: new Date(), - collection: '${collection}', - document_count: count, - content_hash: '${CONTENT_SHA256}', - schema_version: '${schema_version}', - baseline_comparison: { - expected: ${baseline}, - threshold_pct: ${QUALITY_THRESHOLD_PCT} - }, - status: count >= (${baseline} * ${QUALITY_THRESHOLD_PCT} / 100) ? - 'SUCCESS' : 'QUALITY_WARNING' - }); - });" -} -``` - -## **7. 
-
-## **8. Quality Control Measures**
-```bash
-# === Quality Control ===
-validate_environment() {
-    # Verify all dependencies exist
-    local missing=()
-    for cmd in bsdtar xq mongosh curl xmllint; do
-        if ! command -v "$cmd" >/dev/null; then
-            missing+=("$cmd")
-        fi
-    done
-
-    if (( ${#missing[@]} > 0 )); then
-        log_error "Missing dependencies: ${missing[*]}"
-    fi
-}
-
-get_previous_document_count() {
-    # Get document count from previous successful run
-    # Args:
-    #   $1 - Collection name
-    mongosh "$MONGO_URI" --quiet --eval "
-        db.${1}.countDocuments()"
-}
-```
-
-## **9. Main Execution Flow**
-```bash
-# === Main Execution ===
-main() {
-    validate_environment
-    readonly PIPELINE_VERSION="1.0.0"  # Semantic versioning
-
-    if ! execute_data_pipeline; then
-        log_error "Pipeline execution failed"
-        exit 1
-    fi
-
-    exit 0
-}
-
-main "$@"
-```
+
+## === Data Transformation ===
+transform_to_json() {
+    # Convert the XML stream to line-delimited JSON, one document per
+    # Opportunity, stamping ingestion metadata on each record. The content
+    # hash is deliberately not embedded per document: it is only known once
+    # the full stream has been read, so it lands in pipeline_metadata instead.
+    local schema_version="$1"
+    xq -c --arg schema_version "$schema_version" --arg pipeline_version "$VERSION" '
+        .Opportunities.Opportunity[] |
+        ._metadata = {
+            ingestion_date: now|todateiso8601,
+            schema_version: $schema_version,
+            pipeline_version: $pipeline_version
+        }'
+}
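+
+## === MongoDB Integration ===
+# Retained from the previous revision (the pipeline's final stage) and
+# reworked as a sketch: it assumes mongosh, being Node-based, exposes
+# process.stdin and honors top-level await in --eval scripts. insertMany
+# batches replace the old reused bulk op (which cannot be re-executed after
+# bulk.execute()), and input is re-split on newlines because 'data' chunks
+# do not align with document boundaries.
+import_to_mongodb() {
+    # Stream line-delimited JSON into MongoDB and record run metadata
+    # Args:
+    #   $1 - Target collection name
+    #   $2 - Baseline document count
+    #   $3 - Schema version
+    local collection=$1
+    local baseline=$2
+    local schema_version=$3
+
+    mongosh "$MONGO_URI" --quiet --eval "
+        const threshold = Math.floor(${baseline} * ${QUALITY_THRESHOLD_PCT} / 100);
+        let buffer = '', batch = [], count = 0;
+        const flush = () => {
+            if (batch.length === 0) return;
+            db.getCollection('${collection}').insertMany(batch, { ordered: false });
+            count += batch.length;
+            batch = [];
+        };
+        process.stdin.setEncoding('utf8');
+        await new Promise((resolve, reject) => {
+            process.stdin.on('data', chunk => {
+                buffer += chunk;
+                const lines = buffer.split('\n');
+                buffer = lines.pop();
+                for (const line of lines) {
+                    if (line.trim()) batch.push(JSON.parse(line));
+                    if (batch.length >= ${BULK_BATCH_SIZE}) flush();
+                }
+            });
+            process.stdin.on('end', () => {
+                if (buffer.trim()) batch.push(JSON.parse(buffer));
+                flush();
+                db.pipeline_metadata.insertOne({
+                    timestamp: new Date(),
+                    collection: '${collection}',
+                    document_count: count,
+                    schema_version: '${schema_version}',
+                    baseline_comparison: {
+                        expected: ${baseline},
+                        threshold_pct: ${QUALITY_THRESHOLD_PCT}
+                    },
+                    status: count >= threshold ? 'SUCCESS' : 'QUALITY_WARNING'
+                });
+                resolve();
+            });
+            process.stdin.on('error', reject);
+        });"
+}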
log_error "Invalid option: -$OPTARG";; + esac + done -This version meets the highest standards for: -- Production reliability -- Maintainability -- Audit compliance -- Operational visibility + validate_environment + execute_data_pipeline + exit 0 +} -Would you like me to add any specific enterprise features like: -- Slack/Teams notifications -- Prometheus metrics export -- Airflow/DAG integration -- Detailed unit test scaffolding? - ---- - -For a script of this caliber—handling mission-critical government data with enterprise-grade validation—the name should be: - -### **Final Recommendation:** -**`grants-xml-sync`** - -### **Why This Name?** -1. **Clear Purpose** – Instantly communicates it syncs XML grants data. -2. **Precision** – Specifies the data type (`XML`) and source (`grants`). -3. **Unix Convention** – Follows the `[tool]-[format]-[action]` pattern (e.g., `aws-s3-copy`, `kafka-log-export`). -4. **Avoids Ambiguity** – No vague terms like `manager`, `handler`, or `tool`. - ---- - -### **Alternatives Considered (and Rejected)** -| Name | Problem | -|------|---------| -| `fetch_grants.sh` | Too generic; doesn’t specify XML | -| `grants_gov_downloader` | Verbose; redundant ("gov" is implied) | -| `grants-xml-validator` | Overemphasizes validation over sync | -| `update_grants_db` | Misleading (it doesn’t modify a DB) | - ---- - -### **Filesystem Placement** -Deploy it like a first-class tool: -```bash -sudo install -m 755 grants-xml-sync /usr/local/bin/ -``` -**Logs & Data**: -```bash -/var/log/grants-xml-sync.json.log # Structured JSON logs -/data/grants/ # Downloaded extracts -``` - ---- - -### **Bonus: Systemd Service Name** -If you later automate it as a service: -**`grants-xml-sync.service`** -```ini -[Unit] -Description=Grants.gov XML Sync Service - -[Service] -ExecStart=/usr/local/bin/grants-xml-sync -``` - ---- - -This naming ensures **clarity, maintainability, and professionalism**—critical for a script of this importance. \ No newline at end of file +main "$@" \ No newline at end of file