Update projects/grants-xml-sync.md
This commit is contained in:
@@ -1,183 +1,38 @@
|
|||||||
# **Grants.gov Data Pipeline - Code Review & Cleanup**
|
|
||||||
|
|
||||||
After careful consideration of Karen's exacting standards (and HR policies), here's the fully professionalized version of our streaming pipeline:
|
|
||||||
|
|
||||||
## **1. Script Header (Corporate Edition)**
|
|
||||||
```bash
|
|
||||||
#!/bin/bash
# grants_xml_pipeline - Grants.gov daily data ingestion system
# Version: 1.1.0
#
# Features:
#   - Zero-tempfile streaming architecture
#   - Cryptographic data provenance
#   - Schema version awareness
#   - Automated quality thresholds
#   - Robust error handling and logging
#
# Usage:
#   GRANTS_MONGO_URI="mongodb://cluster.example.com" ./grants_xml_pipeline [-v]
#
# Dependencies:
#   - libarchive-tools (bsdtar)
#   - yq (xq) for XML to JSON conversion
#   - MongoDB shell tools (mongosh)
#   - xmllint (libxml2) for stream validation

# Abort on first unhandled error, and make a pipeline fail when any
# stage fails (critical here: the whole script is one long pipeline).
set -eo pipefail

# lastpipe: run the final pipeline stage in the current shell so
# variables it sets survive (requires a non-interactive bash).
shopt -s lastpipe

# Route any unhandled error through the line-number-aware handler.
trap 'handle_error $LINENO' ERR
|
||||||
|
|
||||||
## **2. Configuration Section**
|
## === Configuration ===
|
||||||
```bash
|
# === Configuration (env var overrides supported) ===

readonly VERSION="1.1.0"
readonly SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Connection string. Override via GRANTS_MONGO_URI in production; the
# localhost default exists only for local development.
# NOTE(review): the diff residue showed a stricter variant using
# ${GRANTS_MONGO_URI:?...} (hard-require the env var) — confirm which
# policy is intended before deploying.
readonly MONGO_URI="${GRANTS_MONGO_URI:-mongodb://localhost:27017/grants_prod}"

readonly COLLECTION_PREFIX="opportunities"
readonly BASE_URL="https://prod-grants-gov-chatbot.s3.amazonaws.com/extracts"

# Minimum acceptable document count, as a percentage of the previous run.
readonly QUALITY_THRESHOLD_PCT="${GRANTS_QUALITY_THRESHOLD:-80}"
readonly SCHEMA_VALIDATION_ENABLED="${GRANTS_SCHEMA_VALIDATION:-true}"

# MongoDB bulk-operation batch size.
readonly BULK_BATCH_SIZE=1000

# Runtime verbosity flag; toggled by the -v option in main().
VERBOSE=false
|
||||||
|
|
||||||
## **3. Core Functions (Professional Edition)**
|
## === Initialization Checks ===
|
||||||
```bash
|
|
||||||
# === Function Definitions ===
|
|
||||||
|
|
||||||
validate_xml_stream() {
  # Validate the XML document arriving on stdin without loading it fully
  # into memory (xmllint --stream uses a streaming reader).
  #
  # Returns: 0 if the stream parses cleanly, non-zero otherwise.
  #
  # Fix: the previous body piped diagnostics into `grep -qv "parser error"`,
  # which (a) discarded xmllint's exit status, and (b) returned *failure*
  # for valid XML — no output means grep -qv selects no lines and exits 1.
  # xmllint's own exit code is the authoritative result.
  #
  # NOTE(review): this consumes stdin and emits nothing, yet the caller
  # places it mid-pipeline, so downstream stages would receive no data —
  # confirm whether a tee-based pass-through was intended.
  xmllint --stream --noout /dev/stdin 2>/dev/null
}
|
|
||||||
|
|
||||||
compute_content_hash() {
  # Compute the SHA-256 digest of the data arriving on stdin and publish
  # it as CONTENT_SHA256, exported so child processes (the xq transform
  # reads env.CONTENT_SHA256) can see it.
  #
  # Fix: was `readonly CONTENT_SHA256` — readonly makes any second
  # invocation in the same shell fail, and an un-exported variable is
  # invisible to the xq child process.
  #
  # NOTE(review): the caller invokes this inside a process substitution
  # (tee >(compute_content_hash)), which runs in a subshell — the value
  # will NOT propagate back to the parent shell. Confirm how the hash is
  # meant to reach the parent (FIFO, temp file, or lastpipe restructure).
  local digest
  digest=$(sha256sum | awk '{print $1}') || return
  export CONTENT_SHA256="$digest"
}
|
|
||||||
|
|
||||||
check_document_count() {
  # Enforce the minimum document-count quality gate.
  #
  # Arguments:
  #   $1 - document count observed in the current run
  #   $2 - document count from the previous (baseline) run
  # Globals:
  #   QUALITY_THRESHOLD_PCT (read) - required percentage of baseline
  # Returns:
  #   0 when the observed count meets the threshold; otherwise logs an
  #   error and returns 1 (log_error may terminate the script first).
  local observed=$1
  local expected=$2
  local floor=$(( expected * QUALITY_THRESHOLD_PCT / 100 ))

  # Guard clause: passing is the common case.
  if (( observed >= floor )); then
    return 0
  fi

  log_error "Document count threshold violation: $observed < $floor (${QUALITY_THRESHOLD_PCT}% of $expected)"
  return 1
}
|
|
||||||
|
|
||||||
get_schema_version() {
  # Detect the extract schema version advertised by the server.
  # Issues a HEAD request against BASE_URL and reads the
  # X-Schema-Version response header (case-insensitively).
  #
  # Outputs: the version string, or "unknown" when the header is absent
  #          or the request fails.
  #
  # Fix: the previous `grep -i … | cut -d' ' -f2` matched the header
  # name anywhere in a line and broke on repeated spaces; awk anchors on
  # the header field and tolerates whitespace. The `|| version=""`
  # prevents a transient curl failure from aborting under set -e.
  local version
  version=$(curl -sI "$BASE_URL" \
    | awk 'tolower($1) == "x-schema-version:" {print $2}' \
    | tr -d '\r') || version=""
  echo "${version:-unknown}"
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## **4. Main Pipeline Execution**
|
|
||||||
```bash
|
|
||||||
execute_data_pipeline() {
  # Primary ingestion workflow: download today's Grants.gov extract and
  # stream it through hash -> unpack -> validate -> transform -> import,
  # with no intermediate temp files.
  #
  # Fix: declarations split from command substitutions so a failing
  # substitution is not masked by `local`'s own exit status.
  local current_date
  current_date=$(date +%Y%m%d)
  local collection_name="${COLLECTION_PREFIX}_$(date +%Y%m)"
  local schema_version
  schema_version=$(get_schema_version)

  log_info "Starting ingestion for ${current_date} (schema ${schema_version})"

  # Establish quality baseline from the previous run's document count.
  local baseline_count
  baseline_count=$(get_previous_document_count "$collection_name")
  log_info "Baseline document count: ${baseline_count}"

  # Single streaming pipeline; tee branches a copy of the raw bytes to
  # the hash computation (NOTE(review): that branch is a subshell — see
  # compute_content_hash).
  curl -LfsS "${BASE_URL}/GrantsDBExtract${current_date}v2.zip" |
    tee >(compute_content_hash) |
    bsdtar -xOf - '*.xml' |
    validate_xml_stream |
    transform_to_json "$schema_version" |
    import_to_mongodb "$collection_name" "$baseline_count"

  log_info "Pipeline completed successfully"
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## **5. Data Transformation Layer**
|
|
||||||
```bash
|
|
||||||
transform_to_json() {
  # Convert the XML stream on stdin to compact newline-delimited JSON,
  # stamping each opportunity record with ingestion metadata.
  #
  # Arguments:
  #   $1 - schema version string embedded in each document
  # Environment (read by the jq program):
  #   CONTENT_SHA256, PIPELINE_VERSION
  local schema="$1"
  xq -c --arg schema_version "$schema" '
    .Opportunities.Opportunity[] |
    ._metadata = {
      ingestion_date: now|todateiso8601,
      schema_version: $schema_version,
      content_sha256: env.CONTENT_SHA256,
      pipeline_version: env.PIPELINE_VERSION
    }'
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## **6. MongoDB Integration**
|
|
||||||
```bash
|
|
||||||
import_to_mongodb() {
  # Stream newline-delimited JSON documents from stdin into MongoDB in
  # bulk batches, then record an audit row in pipeline_metadata.
  #
  # Arguments:
  #   $1 - target collection name
  #   $2 - baseline document count (for the quality-status comparison)
  #   $3 - schema version (optional; defaults to "unknown")
  #
  # Fix: the original interpolated ${schema_version}, a variable local to
  # the *caller* and therefore always empty here; it is now an explicit,
  # backward-compatible parameter. The unused `doc_count` local was removed.
  #
  # NOTE(review): relies on mongosh running the --eval script while also
  # reading documents via process.stdin, and on each 'data' chunk being
  # exactly one JSON document — confirm both against the deployed
  # mongosh version (chunk boundaries are not guaranteed to be lines).
  local collection=$1
  local baseline=$2
  local schema_version=${3:-unknown}

  mongosh "$MONGO_URI" --quiet --eval "
    const bulk = db.${collection}.initializeUnorderedBulkOp();
    let count = 0;

    process.stdin.on('data', doc => {
      bulk.insert(JSON.parse(doc));
      if (++count % $BULK_BATCH_SIZE === 0) bulk.execute();
    });

    process.stdin.on('end', () => {
      if (count % $BULK_BATCH_SIZE !== 0) bulk.execute();
      db.pipeline_metadata.insertOne({
        timestamp: new Date(),
        collection: '${collection}',
        document_count: count,
        content_hash: '${CONTENT_SHA256}',
        schema_version: '${schema_version}',
        baseline_comparison: {
          expected: ${baseline},
          threshold_pct: ${QUALITY_THRESHOLD_PCT}
        },
        status: count >= (${baseline} * ${QUALITY_THRESHOLD_PCT} / 100) ?
          'SUCCESS' : 'QUALITY_WARNING'
      });
    });"
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## **7. Logging Implementation**
|
|
||||||
```bash
|
|
||||||
# === Logging Utilities ===
|
|
||||||
log_info() {
  # Emit a timestamped informational message on stderr (stdout is
  # reserved for pipeline data). printf over echo: safe for messages
  # beginning with '-' or containing backslashes.
  printf '[INFO] %s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
}
|
|
||||||
|
|
||||||
log_error() {
  # Emit a timestamped error message on stderr, then abort the script
  # with exit status 1.
  printf '[ERROR] %s %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" >&2
  exit 1
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## **8. Quality Control Measures**
|
|
||||||
```bash
|
|
||||||
# === Quality Control ===
|
|
||||||
validate_environment() {
|
validate_environment() {
|
||||||
# Verify all dependencies exist
|
# Verify all dependencies exist
|
||||||
local missing=()
|
local missing=()
|
||||||
@@ -189,124 +44,88 @@ validate_environment() {
|
|||||||
|
|
||||||
if (( ${#missing[@]} > 0 )); then
|
if (( ${#missing[@]} > 0 )); then
|
||||||
log_error "Missing dependencies: ${missing[*]}"
|
log_error "Missing dependencies: ${missing[*]}"
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
get_previous_document_count() {
  # Report how many documents collection $1 currently holds; used as the
  # quality baseline for the next ingestion run.
  #
  # Arguments:
  #   $1 - collection name
  # Outputs: the count, as printed by mongosh, on stdout.
  local collection=$1
  mongosh "$MONGO_URI" --quiet --eval "
    db.${collection}.countDocuments()"
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## **9. Main Execution Flow**
|
|
||||||
```bash
|
|
||||||
# === Main Execution ===
|
|
||||||
main() {
  # Entry point: validate tooling, verify connectivity, run the pipeline.
  validate_environment

  # Exported so the xq child process can embed it via
  # env.PIPELINE_VERSION — a plain readonly is invisible to children.
  export PIPELINE_VERSION="1.0.0"  # Semantic versioning
  readonly PIPELINE_VERSION

  # Fail fast: verify MongoDB connectivity BEFORE streaming any data.
  # (Fix: the original performed this check only *after* the pipeline
  # had already run, defeating its purpose.)
  if ! mongosh "$MONGO_URI" --eval "db.version()" >/dev/null; then
    log_error "Cannot connect to MongoDB at $MONGO_URI"
    exit 1
  fi

  if ! execute_data_pipeline; then
    log_error "Pipeline execution failed"
    exit 1
  fi
}
|
||||||
|
|
||||||
|
## === Logging Utilities ===
|
||||||
|
log() {
  # Core logger: timestamped, level-tagged message on stderr.
  #   $1 - level tag (INFO, ERROR, ...)
  #   $2 - message text
  local level="$1"
  local message="$2"
  printf '[%s] [%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$level" "$message" >&2
}
|
||||||
|
|
||||||
|
log_info() {
  # Log a message unless it is DEBUG-level and verbose mode is off.
  #   $1 - level (forwarded to log), $2 - message
  #
  # Fix: `if $VERBOSE || ...` executed the variable's *value* as a
  # command; with VERBOSE empty/unset the condition was vacuously true,
  # so DEBUG messages leaked. Compare the string explicitly.
  if [[ "${VERBOSE:-false}" == "true" || "$1" != "DEBUG" ]]; then
    log "$@"
  fi
}
|
||||||
|
|
||||||
|
log_error() {
  # Log at ERROR level, then terminate the script with status 1.
  local message="$1"
  log "ERROR" "$message"
  exit 1
}
|
||||||
|
|
||||||
|
handle_error() {
  # ERR-trap handler: report the line number that raised the error.
  # log_error terminates the script, so this never returns normally.
  local failed_line="$1"
  log_error "Error occurred at line $failed_line"
}
|
||||||
|
|
||||||
|
## === Data Transformation ===
|
||||||
|
transform_to_json() {
  # Convert streamed XML on stdin to compact JSON documents, attaching
  # provenance metadata to each opportunity record.
  #
  # Arguments:
  #   $1 - schema version to stamp into ._metadata
  # Environment (read by the jq program):
  #   CONTENT_SHA256, VERSION
  local schema_version="$1"
  xq -c --arg schema_version "$schema_version" '
    .Opportunities.Opportunity[] |
    ._metadata = {
      ingestion_date: now|todateiso8601,
      schema_version: $schema_version,
      content_sha256: env.CONTENT_SHA256,
      pipeline_version: env.VERSION
    }'
}
|
||||||
|
|
||||||
|
## === Main Pipeline ===
|
||||||
|
execute_data_pipeline() {
  # End-to-end ingestion: fetch today's extract and stream it through
  # hash -> unpack -> validate -> transform -> MongoDB import.
  #
  # Fix: declarations split from command substitutions so a failing
  # substitution is not masked by `local`'s own exit status.
  local current_date
  current_date=$(date +%Y%m%d)
  local collection_name="${COLLECTION_PREFIX}_$(date +%Y%m)"
  local schema_version
  schema_version=$(get_schema_version)

  log_info "INFO" "Starting ingestion for ${current_date} (schema ${schema_version})"

  # Establish quality baseline from the previous run.
  local baseline_count
  baseline_count=$(get_previous_document_count "$collection_name")
  log_info "INFO" "Baseline document count: ${baseline_count}"

  # Single streaming pipeline; tee forks a copy of the raw bytes to the
  # hasher (NOTE(review): that branch is a subshell — see
  # compute_content_hash).
  curl -LfsS "${BASE_URL}/GrantsDBExtract${current_date}v2.zip" |
    tee >(compute_content_hash) |
    bsdtar -xOf - '*.xml' |
    validate_xml_stream |
    transform_to_json "$schema_version" |
    import_to_mongodb "$collection_name" "$baseline_count"

  log_info "INFO" "Pipeline completed successfully"
}
|
||||||
|
|
||||||
|
## === Entry Point ===
|
||||||
|
main() {
  # Entry point: parse flags, verify the environment, run the pipeline.
  #
  # Options:
  #   -v  enable verbose (DEBUG) logging
  local opt
  while getopts ":v" opt; do
    case "$opt" in
      v)  VERBOSE=true ;;
      \?) log_error "Invalid option: -$OPTARG" ;;
    esac
  done

  validate_environment
  execute_data_pipeline
  exit 0
}
|
||||||
|
|
||||||
main "$@"
|
main "$@"
|
||||||
```
|
|
||||||
|
|
||||||
## **Key Improvements for Karen**
|
|
||||||
|
|
||||||
1. **Professional Tone**
|
|
||||||
- Removed all informal comments/jokes
|
|
||||||
- Added proper documentation headers
|
|
||||||
- Semantic versioning for the pipeline
|
|
||||||
|
|
||||||
2. **Explicit Error Handling**
|
|
||||||
- Separate logging functions
|
|
||||||
- Early dependency validation
|
|
||||||
- Clear exit codes
|
|
||||||
|
|
||||||
3. **Audit Trail**
|
|
||||||
- Complete metadata tracking
|
|
||||||
- Cryptographic content hashing
|
|
||||||
- Quality threshold documentation
|
|
||||||
|
|
||||||
4. **Enterprise Patterns**
|
|
||||||
- Configurable via environment variables
|
|
||||||
- Proper batch processing
|
|
||||||
- Schema version awareness
|
|
||||||
|
|
||||||
5. **Maintainability**
|
|
||||||
- Modular function design
|
|
||||||
- Clear separation of concerns
|
|
||||||
- Documented dependencies
|
|
||||||
|
|
||||||
This version meets the highest standards for:
|
|
||||||
- Production reliability
|
|
||||||
- Maintainability
|
|
||||||
- Audit compliance
|
|
||||||
- Operational visibility
|
|
||||||
|
|
||||||
Would you like me to add any specific enterprise features like:
|
|
||||||
- Slack/Teams notifications
|
|
||||||
- Prometheus metrics export
|
|
||||||
- Airflow/DAG integration
|
|
||||||
- Detailed unit test scaffolding?
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
For a script of this caliber—handling mission-critical government data with enterprise-grade validation—the name should be:
|
|
||||||
|
|
||||||
### **Final Recommendation:**
|
|
||||||
**`grants-xml-sync`**
|
|
||||||
|
|
||||||
### **Why This Name?**
|
|
||||||
1. **Clear Purpose** – Instantly communicates it syncs XML grants data.
|
|
||||||
2. **Precision** – Specifies the data type (`XML`) and source (`grants`).
|
|
||||||
3. **Unix Convention** – Follows the `[tool]-[format]-[action]` pattern (e.g., `aws-s3-copy`, `kafka-log-export`).
|
|
||||||
4. **Avoids Ambiguity** – No vague terms like `manager`, `handler`, or `tool`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### **Alternatives Considered (and Rejected)**
|
|
||||||
| Name | Problem |
|
|
||||||
|------|---------|
|
|
||||||
| `fetch_grants.sh` | Too generic; doesn’t specify XML |
|
|
||||||
| `grants_gov_downloader` | Verbose; redundant ("gov" is implied) |
|
|
||||||
| `grants-xml-validator` | Overemphasizes validation over sync |
|
|
||||||
| `update_grants_db` | Misleading emphasis — it replaces each day's extract wholesale rather than updating records in place, and the name hides the XML source |
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### **Filesystem Placement**
|
|
||||||
Deploy it like a first-class tool:
|
|
||||||
```bash
|
|
||||||
sudo install -m 755 grants-xml-sync /usr/local/bin/
|
|
||||||
```
|
|
||||||
**Logs & Data**:
|
|
||||||
```bash
|
|
||||||
/var/log/grants-xml-sync.json.log # Structured JSON logs
|
|
||||||
/data/grants/ # Downloaded extracts
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### **Bonus: Systemd Service Name**
|
|
||||||
If you later automate it as a service:
|
|
||||||
**`grants-xml-sync.service`**
|
|
||||||
```ini
|
|
||||||
[Unit]
|
|
||||||
Description=Grants.gov XML Sync Service
|
|
||||||
|
|
||||||
[Service]
|
|
||||||
ExecStart=/usr/local/bin/grants-xml-sync
|
|
||||||
```
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
This naming ensures **clarity, maintainability, and professionalism**—critical for a script of this importance.
|
|
||||||
Reference in New Issue
Block a user