#!/usr/bin/env bash
#
# Test awk vs Python speed.
#
# On this hash table benchmark, Python is maybe 10% slower than gawk.  mawk is
# twice as fast as gawk (and bwk).
#
# Python has much more functionality, so it's not exactly a fair comparison,
# but it's instructive.
#
# Update: simply adding tolower() makes gawk much slower than Python (555 ms
# vs. 280 ms), and mawk is still much faster at 138 ms.
#
# Mawk is known to be fast?  Faster than Java on this benchmark.
# https://brenocon.com/blog/2009/09/dont-mawk-awk-the-fastest-and-most-elegant-big-data-munging-language/
#
# Usage:
#   ./awk-python.sh <function name>

# Strict mode: error on unset variables, propagate a failure from any stage of
# a pipeline, and stop at the first failing command.
set -o nounset
set -o pipefail
set -o errexit

# NOTE(review): FOO is not read anywhere else in this file, and the AST dump at
# the bottom of the file has no node for it.  It looks like a translator
# placeholder -- confirm before removing.
setconst global FOO = "bar"

# Input corpus shared by every benchmark below: shell and Python sources
# matched by globs rooted at the parent of the current directory.
# NOTE(review): the quoted '(' looks like a translation artifact; the AST dump
# records this value as a five-element array literal.  Confirm the intended
# array syntax for this dialect.
readonly FILES = '('../*.sh ../*/*.sh ../*.py ../*/*.py ../*/*/*.py)

# Benchmark the hash table (associative array) implementation of several awks.
#
# For each of gawk, mawk and ~/git/bwk/bwk: print a banner naming the
# interpreter, then time 10 runs of an awk program over FILES.  The program
# lowercases every input line, counts the total number of lines, counts the
# distinct lines by using the array `unique` as a set/counter, and prints both
# totals in its END block.
#
# Recorded result: mawk is faster: 77ms vs 155ms for 10 iterations.
proc test-awk {
  for awk in [gawk mawk] ~/git/bwk/bwk {
    # Banner so the timing output of each interpreter can be told apart.
    echo ---
    echo $awk
    echo ---
    # `time` wraps the whole loop, so the figure reported is for all 10 runs.
    time for i in [{1..10}] {
      $awk '
      { 
        line = tolower($0)
        num_lines += 1

        # NOTE: gawk has length(); mawk does not
        if (!(line in unique)) {
          num_unique += 1
        }
        unique[line] += 1
      }
      END {
        print "unique lines: " num_unique
        print "total lines: " num_lines
      }
      ' $(FILES[@])

    }
  }
}

# Python counterpart of the awk benchmark: time 10 runs of an inline Python
# program that reads every path in FILES, lowercases each line, and counts
# total and distinct lines with a collections.defaultdict(int).
#
# Recorded result: the Python VM is slower: 160-170 ms.  Oops.
#
# Well, Python has more general dictionaries -- their keys can be more than
# just strings.
#
# The inline program uses `print` statements, i.e. Python 2 syntax, so
# `python` must resolve to a Python 2 interpreter.
proc test-python {
  # `time` wraps the whole loop, so the figure reported is for all 10 runs.
  time for i in [{1..10}] {
    # -S skips importing the `site` module at interpreter startup.
    python -S -c '
import collections
import sys

num_lines = 0
num_unique = 0
unique = collections.defaultdict(int)

for path in sys.argv[1:]:
  with open(path) as f:
    for line in f:
      line = line.lower()
      num_lines += 1

      if line not in unique:
        num_unique += 1
      unique[line] += 1

print "unique lines: ", num_unique
print "total lines: ", num_lines
      ' $(FILES[@])

  }
}

# I/O baseline: time 10 runs of concatenating FILES and counting the bytes.
#
# Recorded result: only 10-30 ms, so the awk and Python benchmarks above are
# dominated by real work rather than by reading the input.
proc test-wc {
  # `time` wraps the whole loop, so the figure reported is for all 10 runs.
  time for i in [{1..10}] {
    cat $(FILES[@]) | wc -c
  }
}

# Show the benchmark input: print every path in FILES, then the number of
# entries followed by the word "files".
proc files {
  echo $(FILES[@])
  # Array length of FILES (the AST dump records this as a '#' prefix op).
  echo "$(#FILES[@]) files"
}

# Entry point: run the script's arguments as a command, i.e. the function
# named on the command line (see "Usage" at the top of the file).
@Argv
(CommandList
  children: [
    (C {(set)} {(-o)} {(nounset)})
    (C {(set)} {(-o)} {(pipefail)})
    (C {(set)} {(-o)} {(errexit)})
    (Assignment
      keyword: Assign_Readonly
      pairs: [
        (assign_pair
          lhs: (LhsName name:FILES)
          op: Equal
          rhs: 
            {
              (ArrayLiteralPart
                words: [
                  {(../) (Lit_Other "*") (.sh)}
                  {(../) (Lit_Other "*") (/) (Lit_Other "*") (.sh)}
                  {(../) (Lit_Other "*") (.py)}
                  {(../) (Lit_Other "*") (/) (Lit_Other "*") (.py)}
                  {(../) (Lit_Other "*") (/) (Lit_Other "*") (/) (Lit_Other "*") (.py)}
                ]
              )
            }
          spids: [76]
        )
      ]
      spids: [74]
    )
    (FuncDef
      name: test-awk
      body: 
        (BraceGroup
          children: [
            (ForEach
              iter_name: awk
              iter_words: [{(gawk)} {(mawk)} {(TildeSubPart prefix:"") (/git/bwk/bwk)}]
              do_arg_iter: False
              body: 
                (DoGroup
                  children: [
                    (C {(echo)} {(---)})
                    (C {(echo)} {($ VSub_Name "$awk")})
                    (C {(echo)} {(---)})
                    (TimeBlock
                      pipeline: 
                        (ForEach
                          iter_name: i
                          iter_words: [{(Lit_LBrace "{") (1..10) (Lit_RBrace "}")}]
                          do_arg_iter: False
                          body: 
                            (DoGroup
                              children: [
                                (C {($ VSub_Name "$awk")} 
                                  {
                                    (SQ <"\n"> <"      { \n"> <"        line = tolower($0)\n"> 
                                      <"        num_lines += 1\n"> <"\n"> <"        # NOTE: gawk has length(); mawk does not\n"> <"        if (!(line in unique)) {\n"> 
                                      <"          num_unique += 1\n"> <"        }\n"> <"        unique[line] += 1\n"> <"      }\n"> <"      END {\n"> 
                                      <"        print \"unique lines: \" num_unique\n"> <"        print \"total lines: \" num_lines\n"> <"      }\n"> <"      ">
                                    )
                                  } {(DQ (BracedVarSub token:<VSub_Name FILES> bracket_op:(WholeArray op_id:Lit_At) spids:[191196]))}
                                )
                              ]
                              spids: [166 201]
                            )
                          spids: [160 164]
                        )
                    )
                  ]
                  spids: [135 204]
                )
              spids: [126 133]
            )
          ]
          spids: [118]
        )
      spids: [114 117]
    )
    (FuncDef
      name: test-python
      body: 
        (BraceGroup
          children: [
            (TimeBlock
              pipeline: 
                (ForEach
                  iter_name: i
                  iter_words: [{(Lit_LBrace "{") (1..10) (Lit_RBrace "}")}]
                  do_arg_iter: False
                  body: 
                    (DoGroup
                      children: [
                        (C {(python)} {(-S)} {(-c)} 
                          {
                            (SQ <"\n"> <"import collections\n"> <"import sys\n"> <"\n"> 
                              <"num_lines = 0\n"> <"num_unique = 0\n"> <"unique = collections.defaultdict(int)\n"> <"\n"> <"for path in sys.argv[1:]:\n"> 
                              <"  with open(path) as f:\n"> <"    for line in f:\n"> <"      line = line.lower()\n"> <"      num_lines += 1\n"> <"\n"> 
                              <"      if line not in unique:\n"> <"        num_unique += 1\n"> <"      unique[line] += 1\n"> <"\n"> 
                              <"print \"unique lines: \", num_unique\n"> <"print \"total lines: \", num_lines\n"> <"      ">
                            )
                          } {(DQ (BracedVarSub token:<VSub_Name FILES> bracket_op:(WholeArray op_id:Lit_At) spids:[272277]))}
                        )
                      ]
                      spids: [238 282]
                    )
                  spids: [232 236]
                )
            )
          ]
          spids: [222]
        )
      spids: [218 221]
    )
    (FuncDef
      name: test-wc
      body: 
        (BraceGroup
          children: [
            (TimeBlock
              pipeline: 
                (ForEach
                  iter_name: i
                  iter_words: [{(Lit_LBrace "{") (1..10) (Lit_RBrace "}")}]
                  do_arg_iter: False
                  body: 
                    (DoGroup
                      children: [
                        (Pipeline
                          children: [
                            (C {(cat)} 
                              {
                                (DQ 
                                  (BracedVarSub
                                    token: <VSub_Name FILES>
                                    bracket_op: (WholeArray op_id:Lit_At)
                                    spids: [316 321]
                                  )
                                )
                              }
                            )
                            (C {(wc)} {(-c)})
                          ]
                          negated: False
                        )
                      ]
                      spids: [310 331]
                    )
                  spids: [304 308]
                )
            )
          ]
          spids: [294]
        )
      spids: [290 293]
    )
    (FuncDef
      name: files
      body: 
        (BraceGroup
          children: [
            (C {(echo)} 
              {
                (DQ 
                  (BracedVarSub
                    token: <VSub_Name FILES>
                    bracket_op: (WholeArray op_id:Lit_At)
                    spids: [346 351]
                  )
                )
              }
            )
            (C {(echo)} 
              {
                (DQ 
                  (BracedVarSub
                    token: <VSub_Name FILES>
                    prefix_op: VSub_Pound
                    bracket_op: (WholeArray op_id:Lit_At)
                    spids: [358 364]
                  ) (" files")
                )
              }
            )
          ]
          spids: [340]
        )
      spids: [336 339]
    )
    (C {(DQ ($ VSub_At "$@"))})
  ]
)