From ba8d7ca67d856a16efb25d5f174cde4886504eee Mon Sep 17 00:00:00 2001 From: medusa Date: Tue, 1 Jul 2025 12:58:26 +0000 Subject: [PATCH] Add tech_docs/linux/awk.md --- tech_docs/linux/awk.md | 246 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 tech_docs/linux/awk.md diff --git a/tech_docs/linux/awk.md b/tech_docs/linux/awk.md new file mode 100644 index 0000000..60305f7 --- /dev/null +++ b/tech_docs/linux/awk.md @@ -0,0 +1,246 @@ +# **The Ultimate AWK Guide: From Basics to Advanced Data Wrangling** + +## **Table of Contents** +1. [AWK Fundamentals](#1-awk-fundamentals) +2. [Patterns & Actions](#2-patterns--actions) +3. [Built-in Variables](#3-built-in-variables) +4. [Arrays & Data Structures](#4-arrays--data-structures) +5. [Control Flow](#5-control-flow) +6. [Functions & Math](#6-functions--math) +7. [Advanced Text Processing](#7-advanced-text-processing) +8. [Real-World Recipes](#8-real-world-recipes) +9. [Performance & Limitations](#9-performance--limitations) + +--- + +## **1. AWK Fundamentals** + +### **Core Syntax** +```bash +awk [OPTIONS] 'PATTERN {ACTION}' file.txt +``` + +### **Basic Structure** +```awk +BEGIN { ... } # pre-processing (AWK comments start with #) +PATTERN { ... } # line processing +END { ... } # post-processing +``` + +### **Common Flags** +```bash +awk -F: '{print $1}' /etc/passwd # Set field separator +awk -v var=value '...' # Pass variables +awk -f script.awk file.txt # Use script file +``` + +--- + +## **2. Patterns & Actions** + +### **Pattern Types** +```awk +/error/ {print} # Regex match +$3 > 100 {print $1} # Field comparison +NR == 1 {print} # Line number +BEGINFILE {print "Processing:", FILENAME} # Per-file (gawk only) +``` + +### **Special Patterns** +```awk +BEGIN {FS=":"; OFS="\t"} # Set input/output separators +END {print "Total lines:", NR} # Final processing +``` + +--- + +## **3. 
Built-in Variables** + +| Variable | Description | +|----------|-------------| +| `NR` | Current record number (cumulative across all input files) | +| `NF` | Number of fields in the current record | +| `FS` | Field separator (default: whitespace) | +| `OFS` | Output field separator | +| `FILENAME` | Current file name | +| `FNR` | Record number within the current file | + +### **Example** +```awk +awk '{print NR, NF, $0}' file.txt # Show line stats +``` + +--- + +## **4. Arrays & Data Structures** + +### **Associative Arrays** +```awk +{count[$1]++} # Count occurrences +END {for (key in count) print key, count[key]} +``` + +### **Multi-Dimensional Arrays** +```awk +{array[$1,$2] = $3} # Simulated multi-dim (keys are joined with SUBSEP) +``` + +### **Array Functions** +```awk +split(string, array, separator) +asort(array) # Sort by value (gawk only) +asorti(array) # Sort by index (gawk only) +``` + +--- + +## **5. Control Flow** + +### **Conditionals** +```awk +{if ($3 > 100) print "High:", $1 + else print "Low:", $1} +``` + +### **Loops** +```awk +for (i=1; i<=NF; i++) {print $i} # Fields +for (key in array) {print key} # Array keys +``` + +### **Switch/Case** +```awk +switch($1) { # gawk only +  case "foo": print "Found foo"; break +  case /^bar/: print "Starts with bar"; break +  default: print "Other" +} +``` + +--- + +## **6. Functions & Math** + +### **Built-in Functions** +```awk +length($0) # String length +sub(/old/, "new", $1) # In-field substitution +system("date") # Run shell command +``` + +### **Math Operations** +```awk +{sum += $3; sumsq += ($3)^2} +END {print "Mean:", sum/NR, "Std Dev:", sqrt(sumsq/NR - (sum/NR)^2)} +``` + +### **User Functions** +```awk +function double(x) {return x*2} +{d = double($1); print d} +``` + +--- + +## **7. Advanced Text Processing** + +### **Field Manipulation** +```awk +{$1 = toupper($1); $NF = $NF "%"} # Modify fields +``` + +### **Multi-Line Records** +```bash +awk -v RS="" '{print $1}' # Paragraph mode +``` + +### **CSV Processing** +```bash +awk -v FPAT='([^,]+)|("[^"]+")' '{print $2}' data.csv # FPAT is gawk-only; set it with -v, not -F +``` + +--- + +## **8. 
Real-World Recipes** + +### **Log Analysis** +```awk +# Top 10 frequent IPs in access.log +awk '{ip[$1]++} END {for (i in ip) print ip[i], i}' access.log | sort -nr | head +``` + +### **Data Transformation** +```awk +# Convert TSV to CSV +BEGIN {FS="\t"; OFS=","} {$1=$1; print} +``` + +### **Column Statistics** +```awk +# Compute column averages +NR>1 {for(i=1; i<=NF; i++) sum[i]+=$i} +END {for(i=1; i<=NF; i++) print "Col", i, "avg:", sum[i]/(NR-1)} +``` + +### **JSON Generation** +```awk +BEGIN {print "["; FS=","} +{printf "%s {\"name\":\"%s\",\"value\":%s}", (NR>1 ? ",\n" : ""), $1, $2} +END {if (NR) printf "\n"; print "]"} +``` + +--- + +## **9. Performance & Limitations** + +### **Optimization Tips** +```bash +LC_ALL=C awk ... # 2-3x speedup for ASCII +mawk ... # Faster alternative to gawk for large datasets +``` + +### **When Not to Use AWK** +- Binary data processing +- Complex nested data structures (use `jq` for JSON) +- Multi-gigabyte files (consider `split + parallel`) + +### **AWK vs Alternatives** +| Task | Best Tool | +|------|-----------| +| Columnar data | AWK | +| JSON/XML | `jq`/`xq` | +| Complex stats | R/Python | +| Multi-file joins | SQLite | + +--- + +## **Pro Techniques** + +### **Self-Contained Scripts** +```bash +#!/usr/bin/awk -f +BEGIN {print "Starting processing"} +/pattern/ {count++} +END {print "Found", count+0, "matches"} +``` + +### **Two-File Processing** +```awk +# Join two files on first field +NR==FNR {a[$1]=$2; next} +$1 in a {print $0, a[$1]} +``` + +### **Bit Manipulation** +```awk +function is_set(x,bit) {return and(x, lshift(1, bit-1))} # gawk only; bit is 1-based +``` + +--- + +## **Further Learning** +- **Books**: "Effective AWK Programming" (GNU AWK manual) +- **Cheat Sheets**: [awkcheatsheet.com](https://awkcheatsheet.com) +- **Practice**: [exercism.org/tracks/awk](https://exercism.org/tracks/awk) + +**Need an AWK solution?** Describe your data format and desired transformation! \ No newline at end of file