#
# Copyright © 2022 Nick Bowler
#
-# Partial implementation of POSIX "join" command. No options are supported.
+# Partial implementation of POSIX "join" command: only the "-v" and "-a"
+# options are implemented.
+#
+# Not all awk implementations support reading from standard input with the
+# getline function by specifying a filename of "-". In particular, busybox
+# awk will read from a file named "-" instead of standard input. Since
+# busybox-based environments are typically missing "join", this limitation
+# is problematic. As a workaround, do not use "-" for the second input
+# filename.
#
# License WTFPL2: Do What The Fuck You Want To Public License, version 2.
# This is free software: you are free to do what the fuck you want to.
# There is NO WARRANTY, to the extent permitted by law.
BEGIN {
- if (ARGC != 3) {
+ # Output selection flags, adjusted by the -a/-v options parsed below.
+ # show_uniq_lhs / show_uniq_rhs: print unpairable lines of the first /
+ # second file.  show_common: print joined lines for keys found in both.
+ show_uniq_lhs = 0
+ show_uniq_rhs = 0
+ show_common = 1
+
+ # Process command-line options.  Each consumed ARGV element is set to
+ # the empty string so awk skips it when opening input files.
+ for (i = 1; i < ARGC; i++) {
+ # Stop at the first operand; a lone "-" counts as an operand.
+ if (substr(ARGV[i], 1, 1) != "-" || ARGV[i] == "-")
+ break;
+
+ opt = substr(ARGV[i], 2, 1);
+ if (opt == "a" || opt == "v") {
+ # The file number may be attached ("-a1") or separate ("-a 1").
+ num = substr(ARGV[i], 3, 1);
+ if (num == "") {
+ # option argument must be next on command-line
+ ARGV[i++] = "";
+ num = ARGV[i];
+ }
+
+ # -v replaces the joined output; -a adds to it.
+ if (opt == "v") {
+ show_common = 0;
+ }
+
+ if (num == 1) {
+ show_uniq_lhs = 1;
+ } else if (num == 2) {
+ show_uniq_rhs = 1;
+ } else {
+ # invalid argument
+ exit 1;
+ }
+ } else {
+ # unsupported option
+ exit 1;
+ }
+
+ ARGV[i] = "";
+ }
+
+ # Exactly two operands must remain: ARGV[i] and ARGV[i+1].
+ if (i+2 != ARGC) {
+ # invalid usage
exit 1;
}
- file2 = ARGV[2];
- delete ARGV[2];
+ # Remember the second file name and blank it out of ARGV so that only
+ # ARGV[i] is read as awk's main input.
+ file2 = ARGV[i+1];
+ ARGV[i+1] = "";
- advance_rhs();
+ # rhs_nr is reset on every rewind of file2; rhs_max_nr is not.
+ rhs_max_nr = rhs_nr = 0;
+ # Load the first RHS record; a zero return is handled by finish_rhs.
+ if (advance_rhs() == 0)
+ finish_rhs();
}
+{ $1 = $1 }  # assigning to a field makes awk rebuild $0 with fields joined by OFS, so later "print" emits normalised spacing
$1 == lhs_prev {
- # Rewind RHS as we have duplicate keys in LHS.
+ # Rewind RHS as we have duplicate common keys in LHS.
close(file2);
+
+ rhs_nr = 0;  # restart the position counter for the re-read; rhs_max_nr is left alone so it still marks the furthest record reached
advance_rhs();
}
-$1 < rhs[1] { next }
+$1 < rhs[1] {
+ if (show_uniq_lhs) {  # LHS key is below the current RHS key, so (assuming sorted inputs) this LHS line has no partner
+ print;
+ }
+ next;  # skip the remaining rules for this record
+}
+
{
while ($1 > rhs[1]) {
+ if (show_uniq_rhs && rhs_nr == rhs_max_nr)  # equal counters mean this is the furthest RHS record read so far, not one being re-read after a rewind
+ print_rhs();
+
if (advance_rhs() == 0)
- exit(0);
+ finish_rhs();  # advance_rhs returned 0: finish_rhs prints the remaining LHS lines if requested and exits
+
+ if (show_uniq_lhs && $1 < rhs[1])  # RHS moved past this key without equalling it, so the LHS line is unpaired
+ print;
}
}
-$1 == rhs[1] {
- lhs_prev = $1 = $1;
+!rhs_eof && $1 == rhs[1] {  # rhs_eof guard keeps this rule from firing once advance_rhs has flagged end of file
+ lhs_prev = $1;  # remembered so the "$1 == lhs_prev" rule can detect a repeated LHS key on the next record
do {
- print_match();
+ if (show_common) print_match();  # joined output is suppressed when -v cleared show_common
advance_rhs();
- } while ($1 == rhs[1]);
+ } while (!rhs_eof && $1 == rhs[1]);  # keep pairing while the RHS key still equals this LHS key
+}
+
+END {
+    # An "exit" inside BEGIN (usage error, bad option) still runs END
+    # actions.  file2 is only assigned after the command line has been
+    # validated, so when it is unset there is no RHS file to flush;
+    # without this check a usage error after "-a 2"/"-v 2" would print
+    # a spurious blank line and try to read from an empty file name.
+    if (show_uniq_rhs && file2 != "") {
+        # Print the current RHS record and every record after it.
+        # print_rhs prints nothing once rhs_eof is set.
+        do {
+            print_rhs();
+        } while (advance_rhs() > 0);
+    }
}
function advance_rhs(raw, rc)
if (rc < 0)
exit(1);
+ rhs_eof = rc == 0;  # NOTE(review): rc is presumably the result of a read elided from this hunk; 0 would then mean end of file - confirm
+
+ if (rhs_max_nr == rhs_nr++)  # rhs_nr counts every call since the last reset; rhs_max_nr grows only when the two were equal beforehand
+ rhs_max_nr++;
+
split(raw, rhs);
return rc;
}
+function finish_rhs(status)
+{
+    # Unless unpaired LHS lines were requested there is nothing left
+    # to output, so terminate successfully straight away.
+    if (!show_uniq_lhs)
+        exit(0);
+
+    # NR is 0 when called from BEGIN (no record read yet); otherwise
+    # rebuild and print the current record.
+    if (NR > 0) {
+        $1 = $1;
+        print;
+    }
+
+    # Drain the rest of the main input, rebuilding and printing each
+    # record.
+    while ((status = getline) > 0) {
+        $1 = $1;
+        print;
+    }
+
+    # getline yields 0 at end of input and -1 on error; negating it
+    # gives exit status 0 or 1 respectively.
+    exit(-status);
+}
+
+function print_rhs(i)
+{
+    # Nothing to print once rhs_eof has been set.
+    if (rhs_eof)
+        return;
+
+    # With i unset (or below 1) the whole record is wanted, so lead
+    # with the key field.  Callers that have already written the key
+    # pass 2 to suppress it.
+    if (i < 1)
+        printf "%s", rhs[1];
+
+    # Remaining fields, each preceded by a single space.
+    i = 2;
+    while (i in rhs) {
+        printf " %s", rhs[i];
+        i++;
+    }
+
+    # Terminate the output line.
+    print "";
+}
+
function print_match(i)
{
printf "%s", $0
- for (i = 2; i in rhs; i++) {
- printf " %s", rhs[i];
- }
- print ""
+ print_rhs(2);  # passing 2 makes print_rhs skip rhs[1] and append rhs[2..] plus the newline after $0
}