X-Git-Url: https://git.draconx.ca/gitweb/dxcommon.git/blobdiff_plain/5865ffc2d8b828aa78235fbdcf9dfe18a8799980..203a69fab1ff61e958dfb88599fd5d7f24648abf:/scripts/join.awk diff --git a/scripts/join.awk b/scripts/join.awk index 548ca2c..7187b50 100755 --- a/scripts/join.awk +++ b/scripts/join.awk @@ -2,43 +2,115 @@ # # Copyright © 2022 Nick Bowler # -# Partial implementation of POSIX "join" command. No options are supported. +# Partial implementation of POSIX "join" command: only the "-v" and "-a" +# options are implemented. +# +# Not all awk implementations support reading from standard input with the +# getline function by specifying a filename of "-". In particular, busybox +# awk will read from a file named "-" instead of standard input. Since +# busybox-based environments are typically missing "join", this limitation +# is problematic. As a workaround, do not use "-" for the second input +# filename. # # License WTFPL2: Do What The Fuck You Want To Public License, version 2. # This is free software: you are free to do what the fuck you want to. # There is NO WARRANTY, to the extent permitted by law. 
BEGIN { - if (ARGC != 3) { + show_uniq_lhs = 0 + show_uniq_rhs = 0 + show_common = 1 + + # Process command-line options + for (i = 1; i < ARGC; i++) { + if (substr(ARGV[i], 1, 1) != "-" || ARGV[i] == "-") + break; + + opt = substr(ARGV[i], 2, 1); + if (opt == "a" || opt == "v") { + num = substr(ARGV[i], 3, 1); + if (num == "") { + # option argument must be next on command-line + ARGV[i++] = ""; + num = ARGV[i]; + } + + if (opt == "v") { + show_common = 0; + } + + if (num == 1) { + show_uniq_lhs = 1; + } else if (num == 2) { + show_uniq_rhs = 1; + } else { + # invalid argument + exit 1; + } + } else { + # unsupported option + exit 1; + } + + ARGV[i] = ""; + } + + if (i+2 != ARGC) { + # invalid usage exit 1; } - file2 = ARGV[2]; - delete ARGV[2]; + file2 = ARGV[i+1]; + ARGV[i+1] = ""; - advance_rhs(); + rhs_max_nr = rhs_nr = 0; + if (advance_rhs() == 0) + finish_rhs(); } +{ $1 = $1 } $1 == lhs_prev { - # Rewind RHS as we have duplicate keys in LHS. + # Rewind RHS as we have duplicate common keys in LHS.
 close(file2); + + rhs_nr = 0; advance_rhs(); } -$1 < rhs[1] { next } +$1 < rhs[1] { + if (show_uniq_lhs) { + print; + } + next; +} + { while ($1 > rhs[1]) { + if (show_uniq_rhs && rhs_nr == rhs_max_nr) + print_rhs(); + if (advance_rhs() == 0) - exit(0); + finish_rhs(); + + if (show_uniq_lhs && $1 < rhs[1]) + print; } } -$1 == rhs[1] { - lhs_prev = $1 = $1; +!rhs_eof && $1 == rhs[1] { + lhs_prev = $1; do { - print_match(); + if (show_common) print_match(); advance_rhs(); - } while ($1 == rhs[1]); + } while (!rhs_eof && $1 == rhs[1]); +} + +END { + if (show_uniq_rhs) { + do { + print_rhs(); + } while (advance_rhs() > 0); + } } function advance_rhs(raw, rc) @@ -47,15 +119,43 @@ function advance_rhs(raw, rc) if (rc < 0) exit(1); + rhs_eof = rc == 0; + + if (rhs_max_nr == rhs_nr++) + rhs_max_nr++; + split(raw, rhs); return rc; } +function finish_rhs(rc) +{ + rc = 0; + if (show_uniq_lhs) { + do { + if (NR > 0) { + $1 = $1; + print; + } + } while ((rc = getline) > 0); + } + exit(-rc); +} + +function print_rhs(i) +{ + if (!rhs_eof) { + if (i < 1) + printf "%s", rhs[1]; + for (i = 2; i in rhs; i++) { + printf " %s", rhs[i]; + } + print ""; + } +} + function print_match(i) { printf "%s", $0 - for (i = 2; i in rhs; i++) { - printf " %s", rhs[i]; - } - print "" + print_rhs(2); }