#!/usr/bin/env bash
#
# Test awk vs Python speed.
#
# On this hash table benchmark, Python is maybe 10% slower than gawk.  mawk is
# twice as fast as gawk (and bwk).
#
# Python has much more functionality, so it's not exactly a fair comparison,
# but it's instructive.
#
# Update: simply adding tolower() makes gawk much slower than Python (555 ms
# vs. 280 ms), and mawk is still much faster at 138 ms.
#
# Mawk is known to be fast?  Faster than Java on this benchmark.
# https://brenocon.com/blog/2009/09/dont-mawk-awk-the-fastest-and-most-elegant-big-data-munging-language/
#
# Usage:
#   ./awk-python.sh <function name>
#
# where <function name> is one of: test-awk, test-python, test-wc, files.
# The arguments are executed verbatim by the "$@" dispatch at the bottom of
# the script; with no arguments the script defines everything and does nothing.

set -o nounset
set -o pipefail
set -o errexit

# Benchmark corpus.  The globs are expanded once, right here, relative to the
# current working directory.  nullglob is not enabled, so a pattern that
# matches nothing stays in the array as a literal string.
readonly FILES=(../*.sh ../*/*.sh ../*.py ../*/*.py ../*/*/*.py)

# Test out hash table implementations
# mawk is faster: 77ms vs 155ms for 10 iterations.
#
# For each awk implementation (gawk, mawk, and a bwk build expected at
# ~/git/bwk/bwk), print a banner and time 10 passes over FILES.  Each pass
# lowercases every line and counts total and distinct lines.
#
# Globals:   FILES (read)
# Arguments: none
# Outputs:   banner and per-pass counts on stdout; `time` report on stderr
test-awk() {
  local awk i
  for awk in gawk mawk ~/git/bwk/bwk; do
    echo ---
    printf '%s\n' "$awk"
    echo ---
    time for i in {1..10}; do
      "$awk" '
      {
        line = tolower($0)
        num_lines += 1

        # NOTE: gawk has length(); mawk does not
        if (!(line in unique)) {
          num_unique += 1
        }
        unique[line] += 1
      }
      END {
        print "unique lines: " num_unique
        print "total lines: " num_lines
      }
      ' "${FILES[@]}"
    done
  done
}

# Python VM is slower: 160-170 ms.  Oops.
#
# Well Python has more general dictionaries -- they take more than strings.
#
# Time 10 passes of the same line-counting job in Python.  The print calls use
# a single %-formatted string so the program parses under both Python 2 and
# Python 3; the two spaces after the colon reproduce exactly what the original
# Python 2 statement `print "unique lines: ", n` wrote.
#
# Globals:   FILES (read)
# Arguments: none
# Outputs:   per-pass counts on stdout; `time` report on stderr
test-python() {
  local i
  time for i in {1..10}; do
    python -S -c '
import collections
import sys

num_lines = 0
num_unique = 0
unique = collections.defaultdict(int)

for path in sys.argv[1:]:
  with open(path) as f:
    for line in f:
      line = line.lower()
      num_lines += 1

      if line not in unique:
        num_unique += 1
      unique[line] += 1

print("unique lines:  %d" % num_unique)
print("total lines:  %d" % num_lines)
' "${FILES[@]}"
  done
}

# Only 10-30 ms.  We are doing real work.
#
# Baseline: time 10 passes of merely streaming FILES through `wc -c`.  The
# `cat |` is deliberate -- it makes wc print one combined byte count per pass
# rather than a per-file listing.
#
# Globals:   FILES (read)
# Arguments: none
# Outputs:   one byte count per pass on stdout; `time` report on stderr
test-wc() {
  local i
  time for i in {1..10}; do
    cat "${FILES[@]}" | wc -c
  done
}

# Show the benchmark corpus: all paths on one space-separated line (IFS is
# left at its default), followed by "<count> files".
#
# Globals:   FILES (read)
# Arguments: none
# Outputs:   two lines on stdout
files() {
  printf '%s\n' "${FILES[*]}"
  printf '%s\n' "${#FILES[@]} files"
}

# Dispatch: run the function (plus any arguments) named on the command line.
"$@"

# ---------------------------------------------------------------------------
# The original file carried the parse-tree dump below after the script text.
# It is preserved token-for-token (only re-wrapped) as a comment so that it
# no longer breaks parsing.  It describes the script as it was before this
# cleanup (e.g. it still shows the Python 2 print statements).
# NOTE(review): looks like machine-generated parser output -- confirm whether
# it still needs to live in this file.
#
# (CommandList
#   children: [
#     (C {(set)} {(-o)} {(nounset)})
#     (C {(set)} {(-o)} {(pipefail)})
#     (C {(set)} {(-o)} {(errexit)})
#     (Assignment
#       keyword: Assign_Readonly
#       pairs: [
#         (assign_pair
#           lhs: (LhsName name:FILES)
#           op: Equal
#           rhs: {
#             (ArrayLiteralPart
#               words: [
#                 {(../) (Lit_Other "*") (.sh)}
#                 {(../) (Lit_Other "*") (/) (Lit_Other "*") (.sh)}
#                 {(../) (Lit_Other "*") (.py)}
#                 {(../) (Lit_Other "*") (/) (Lit_Other "*") (.py)}
#                 {(../) (Lit_Other "*") (/) (Lit_Other "*") (/) (Lit_Other "*") (.py)}
#               ]
#             )
#           }
#           spids: [76]
#         )
#       ]
#       spids: [74]
#     )
#     (FuncDef
#       name: test-awk
#       body:
#         (BraceGroup
#           children: [
#             (ForEach
#               iter_name: awk
#               iter_words: [{(gawk)} {(mawk)} {(TildeSubPart prefix:"") (/git/bwk/bwk)}]
#               do_arg_iter: False
#               body:
#                 (DoGroup
#                   children: [
#                     (C {(echo)} {(---)})
#                     (C {(echo)} {($ VSub_Name "$awk")})
#                     (C {(echo)} {(---)})
#                     (TimeBlock
#                       pipeline:
#                         (ForEach
#                           iter_name: i
#                           iter_words: [{(Lit_LBrace "{") (1..10) (Lit_RBrace "}")}]
#                           do_arg_iter: False
#                           body:
#                             (DoGroup
#                               children: [
#                                 (C {($ VSub_Name "$awk")}
#                                   {
#                                     (SQ <"\n">
#                                       <" { \n">
#                                       <" line = tolower($0)\n">
#                                       <" num_lines += 1\n">
#                                       <"\n">
#                                       <" # NOTE: gawk has length(); mawk does not\n">
#                                       <" if (!(line in unique)) {\n">
#                                       <" num_unique += 1\n">
#                                       <" }\n">
#                                       <" unique[line] += 1\n">
#                                       <" }\n">
#                                       <" END {\n">
#                                       <" print \"unique lines: \" num_unique\n">
#                                       <" print \"total lines: \" num_lines\n">
#                                       <" }\n">
#                                       <" ">
#                                     )
#                                   }
#                                   {(DQ (BracedVarSub token: bracket_op:(WholeArray op_id:Lit_At) spids:[191196]))}
#                                 )
#                               ]
#                               spids: [166 201]
#                             )
#                           spids: [160 164]
#                         )
#                     )
#                   ]
#                   spids: [135 204]
#                 )
#               spids: [126 133]
#             )
#           ]
#           spids: [118]
#         )
#       spids: [114 117]
#     )
#     (FuncDef
#       name: test-python
#       body:
#         (BraceGroup
#           children: [
#             (TimeBlock
#               pipeline:
#                 (ForEach
#                   iter_name: i
#                   iter_words: [{(Lit_LBrace "{") (1..10) (Lit_RBrace "}")}]
#                   do_arg_iter: False
#                   body:
#                     (DoGroup
#                       children: [
#                         (C {(python)} {(-S)} {(-c)}
#                           {
#                             (SQ <"\n">
#                               <"import collections\n">
#                               <"import sys\n">
#                               <"\n">
#                               <"num_lines = 0\n">
#                               <"num_unique = 0\n">
#                               <"unique = collections.defaultdict(int)\n">
#                               <"\n">
#                               <"for path in sys.argv[1:]:\n">
#                               <" with open(path) as f:\n">
#                               <" for line in f:\n">
#                               <" line = line.lower()\n">
#                               <" num_lines += 1\n">
#                               <"\n">
#                               <" if line not in unique:\n">
#                               <" num_unique += 1\n">
#                               <" unique[line] += 1\n">
#                               <"\n">
#                               <"print \"unique lines: \", num_unique\n">
#                               <"print \"total lines: \", num_lines\n">
#                               <" ">
#                             )
#                           }
#                           {(DQ (BracedVarSub token: bracket_op:(WholeArray op_id:Lit_At) spids:[272277]))}
#                         )
#                       ]
#                       spids: [238 282]
#                     )
#                   spids: [232 236]
#                 )
#             )
#           ]
#           spids: [222]
#         )
#       spids: [218 221]
#     )
#     (FuncDef
#       name: test-wc
#       body:
#         (BraceGroup
#           children: [
#             (TimeBlock
#               pipeline:
#                 (ForEach
#                   iter_name: i
#                   iter_words: [{(Lit_LBrace "{") (1..10) (Lit_RBrace "}")}]
#                   do_arg_iter: False
#                   body:
#                     (DoGroup
#                       children: [
#                         (Pipeline
#                           children: [
#                             (C {(cat)}
#                               {
#                                 (DQ
#                                   (BracedVarSub
#                                     token:
#                                     bracket_op: (WholeArray op_id:Lit_At)
#                                     spids: [316 321]
#                                   )
#                                 )
#                               }
#                             )
#                             (C {(wc)} {(-c)})
#                           ]
#                           negated: False
#                         )
#                       ]
#                       spids: [310 331]
#                     )
#                   spids: [304 308]
#                 )
#             )
#           ]
#           spids: [294]
#         )
#       spids: [290 293]
#     )
#     (FuncDef
#       name: files
#       body:
#         (BraceGroup
#           children: [
#             (C {(echo)}
#               {
#                 (DQ
#                   (BracedVarSub
#                     token:
#                     bracket_op: (WholeArray op_id:Lit_At)
#                     spids: [346 351]
#                   )
#                 )
#               }
#             )
#             (C {(echo)}
#               {
#                 (DQ
#                   (BracedVarSub
#                     token:
#                     prefix_op: VSub_Pound
#                     bracket_op: (WholeArray op_id:Lit_At)
#                     spids: [358 364]
#                   )
#                   (" files")
#                 )
#               }
#             )
#           ]
#           spids: [340]
#         )
#       spids: [336 339]
#     )
#     (C {(DQ ($ VSub_At "$@"))})
#   ]
# )