From 114346686fbed4b26fb1d483dd328a39a62e7563 Mon Sep 17 00:00:00 2001 From: Nick Bowler Date: Tue, 15 Feb 2022 01:04:17 -0500 Subject: [PATCH] Implement the -a and -v options in join.awk. So now users of DX_PROG_JOIN can use these options. Two is better than none! --- m4/join.m4 | 12 ++- scripts/join.awk | 132 +++++++++++++++++++++++++---- tests/scripts.at | 215 ++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 340 insertions(+), 19 deletions(-) diff --git a/m4/join.m4 b/m4/join.m4 index 2f25ce8..a0a8d10 100644 --- a/m4/join.m4 +++ b/m4/join.m4 @@ -10,7 +10,15 @@ dnl Search PATH for standard POSIX "join" utility. If found, the JOIN dnl variable (which is substituted by AC_SUBST) is set to the result and dnl the cache variable dx_cv_join_works is set to "yes". Otherwise, dnl dx_cv_join_works is set to "no" and JOIN is set to an incomplete -dnl awk-based implementation which supports no options. +dnl awk-based implementation. +dnl +dnl The awk replacement currently has the following limitations: +dnl +dnl - Only the -v and -a options are supported. +dnl +dnl - Due to limitations of some awk implementations, a filename of "-" +dnl to mean standard input should be avoided for the second file name. +dnl Nevertheless, using "-" for the first filename should be portable. AC_DEFUN([DX_PROG_JOIN], [AC_PREREQ([2.62])dnl AC_REQUIRE([AC_PROG_AWK])dnl @@ -34,7 +42,7 @@ AS_IF([test x"$dx_cv_join_works" != x"yes"], [JOIN="$AWK -f m4_do( [m4_pushdef([m4_include], [$][1])], [m4_include(DX_BASEDIR[/scripts/join.awk])], - [m4_popdef([m4_include])])"])]) + [m4_popdef([m4_include])]) --"])]) AC_DEFUN([_DX_JOIN_DO_TEST], [AS_IF([test ! -f conftest.b], diff --git a/scripts/join.awk b/scripts/join.awk index 548ca2c..7187b50 100755 --- a/scripts/join.awk +++ b/scripts/join.awk @@ -2,43 +2,115 @@ # # Copyright © 2022 Nick Bowler # -# Partial implementation of POSIX "join" command. No options are supported. 
+# Partial implementation of POSIX "join" command: only the "-v" and "-a" +# options are implemented. +# +# Not all awk implementations support reading from standard input with the +# getline function by specifying a filename of "-". In particular, busybox +# awk will read from a file named "-" instead of standard input. Since +# busybox-based environments are typically missing "join", this limitation +# is problematic. As a workaround, do not use "-" for the second input +# filename. # # License WTFPL2: Do What The Fuck You Want To Public License, version 2. # This is free software: you are free to do what the fuck you want to. # There is NO WARRANTY, to the extent permitted by law. BEGIN { - if (ARGC != 3) { + show_uniq_lhs = 0 + show_uniq_rhs = 0 + show_common = 1 + + # Process command-line options: -aN / -vN, or with N as the next argument + for (i = 1; i < ARGC; i++) { + if (substr(ARGV[i], 1, 1) != "-" || ARGV[i] == "-") + break; + + opt = substr(ARGV[i], 2, 1); + if (opt == "a" || opt == "v") { + num = substr(ARGV[i], 3, 1); + if (num == "") { + # option argument must be next on command-line + ARGV[i++] = ""; + num = ARGV[i]; + } + + if (opt == "v") { + show_common = 0; + } + + if (num == 1) { + show_uniq_lhs = 1; + } else if (num == 2) { + show_uniq_rhs = 1; + } else { + # invalid argument: file number must be 1 or 2 + exit 1; + } + } else { + # unsupported option + exit 1; + } + + ARGV[i] = ""; + } + + if (i+2 != ARGC) { + # invalid usage: exactly two file operands must remain exit 1; } - file2 = ARGV[2]; - delete ARGV[2]; + file2 = ARGV[i+1]; + ARGV[i+1] = ""; - advance_rhs(); + rhs_max_nr = rhs_nr = 0; + if (advance_rhs() == 0) + finish_rhs(); } +{ $1 = $1 } $1 == lhs_prev { - # Rewind RHS as we have duplicate keys in LHS. + # Rewind RHS as we have duplicate common keys in LHS. 
close(file2); + + rhs_nr = 0; advance_rhs(); } -$1 < rhs[1] { next } +$1 < rhs[1] { + if (show_uniq_lhs) { + print; + } + next; +} + { while ($1 > rhs[1]) { + if (show_uniq_rhs && rhs_nr == rhs_max_nr) + print_rhs(); + if (advance_rhs() == 0) - exit(0); + finish_rhs(); + + if (show_uniq_lhs && $1 < rhs[1]) + print; } } -$1 == rhs[1] { - lhs_prev = $1 = $1; +!rhs_eof && $1 == rhs[1] { + lhs_prev = $1; do { - print_match(); + if (show_common) print_match(); advance_rhs(); - } while ($1 == rhs[1]); + } while (!rhs_eof && $1 == rhs[1]); +} + +END { + if (show_uniq_rhs) { + do { + print_rhs(); + } while (advance_rhs() > 0); + } } function advance_rhs(raw, rc) @@ -47,15 +119,43 @@ function advance_rhs(raw, rc) if (rc < 0) exit(1); + rhs_eof = rc == 0; + + if (rhs_max_nr == rhs_nr++) + rhs_max_nr++; + split(raw, rhs); return rc; } +function finish_rhs(rc) +{ + rc = 0; + if (show_uniq_lhs) { + do { + if (NR > 0) { + $1 = $1; + print; + } + } while ((rc = getline) > 0); + } + exit(-rc); +} + +function print_rhs(i) +{ + if (!rhs_eof) { + if (i < 1) + printf "%s", rhs[1]; + for (i = 2; i in rhs; i++) { + printf " %s", rhs[i]; + } + print ""; + } +} + function print_match(i) { printf "%s", $0 - for (i = 2; i in rhs; i++) { - printf " %s", rhs[i]; - } - print "" + print_rhs(2); } diff --git a/tests/scripts.at b/tests/scripts.at index f807650..b349e7f 100644 --- a/tests/scripts.at +++ b/tests/scripts.at @@ -1,4 +1,4 @@ -dnl Copyright © 2021 Nick Bowler +dnl Copyright © 2021-2022 Nick Bowler dnl dnl License WTFPL2: Do What The Fuck You Want To Public License, version 2. dnl This is free software: you are free to do what the fuck you want to. 
@@ -365,3 +365,216 @@ cp tree.def expout AT_CHECK([$CC -o test0$EXEEXT test0.c && ./test0$EXEEXT], [0], [expout]) AT_CLEANUP + +AT_SETUP([join.awk]) + +JOIN="$AWK -f $builddir/scripts/join.awk --" + +AT_DATA([a], +[[1 a +3 a1 x +3 a2 x +5 a +6 a +8 a1 x +8 a2 x +9 a1 +9 a2 +9 a3 +]]) + +AT_DATA([b], +[[2 b +2 b +3 b y +4 b +6 b1 y +6 b2 y +7 b +8 b1 y +8 b2 y +]]) + +AT_CHECK([$JOIN a b], [0], +[[3 a1 x b y +3 a2 x b y +6 a b1 y +6 a b2 y +8 a1 x b1 y +8 a1 x b2 y +8 a2 x b1 y +8 a2 x b2 y +]]) + +AT_CHECK([$JOIN -v1 a b], [0], +[[1 a +5 a +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN -v2 a b], [0], +[[2 b +2 b +4 b +7 b +]]) + +AT_CHECK([$JOIN -v1 -v2 a b], [0], +[[1 a +2 b +2 b +4 b +5 a +7 b +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN -a1 a b], [0], +[[1 a +3 a1 x b y +3 a2 x b y +5 a +6 a b1 y +6 a b2 y +8 a1 x b1 y +8 a1 x b2 y +8 a2 x b1 y +8 a2 x b2 y +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN -a2 a b], [0], +[[2 b +2 b +3 a1 x b y +3 a2 x b y +4 b +6 a b1 y +6 a b2 y +7 b +8 a1 x b1 y +8 a1 x b2 y +8 a2 x b1 y +8 a2 x b2 y +]]) + +AT_CHECK([$JOIN -a1 -a2 a b], [0], +[[1 a +2 b +2 b +3 a1 x b y +3 a2 x b y +4 b +5 a +6 a b1 y +6 a b2 y +7 b +8 a1 x b1 y +8 a1 x b2 y +8 a2 x b1 y +8 a2 x b2 y +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN b a], [0], +[[3 b y a1 x +3 b y a2 x +6 b1 y a +6 b2 y a +8 b1 y a1 x +8 b1 y a2 x +8 b2 y a1 x +8 b2 y a2 x +]]) + +AT_CHECK([$JOIN -v1 b a], [0], +[[2 b +2 b +4 b +7 b +]]) + +AT_CHECK([$JOIN -v2 b a], [0], +[[1 a +5 a +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN -v1 -v2 b a], [0], +[[1 a +2 b +2 b +4 b +5 a +7 b +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN -a1 b a], [0], +[[2 b +2 b +3 b y a1 x +3 b y a2 x +4 b +6 b1 y a +6 b2 y a +7 b +8 b1 y a1 x +8 b1 y a2 x +8 b2 y a1 x +8 b2 y a2 x +]]) + +AT_CHECK([$JOIN -a2 b a], [0], +[[1 a +3 b y a1 x +3 b y a2 x +5 a +6 b1 y a +6 b2 y a +8 b1 y a1 x +8 b1 y a2 x +8 b2 y a1 x +8 b2 y a2 x +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([$JOIN -a1 -a2 b a], [0], +[[1 a +2 b +2 b +3 b y a1 x +3 b y a2 x +4 b +5 
a +6 b1 y a +6 b2 y a +7 b +8 b1 y a1 x +8 b1 y a2 x +8 b2 y a1 x +8 b2 y a2 x +9 a1 +9 a2 +9 a3 +]]) + +AT_CHECK([echo wat | $JOIN -v1 - /dev/null], [0], +[[wat +]]) + +AT_CLEANUP -- 2.43.2