# Awk script to format tabular data into neat left-aligned columns to improve
# readability. It is capable of inserting simple LaTeX table codes. It
# automagically right-aligns numerical fields on their decimal points. It is
# also possible to specify a printf formatter for each or some fields on the
# first line. This is only used to format the ASCII data, not to translate it
# into an appropriate tabular command. Tested with GNU Awk 3.0.3
#
# Dec 1999 - Jan 2000, Erik Janssen, ejanssen@itmatters.nl
#
#
# Features
# --------
# It is possible to both format new tables and reformat existing tables. For
# reformatting an existing table the old formatting has to be removed; this
# feature is only tested for basic LaTeX tables.
#
# Commandline
# -----------
# (g)awk [-v mode=x] [-v line=y] -f mktable.awk your_file
# (g)awk [-v mode=x] [-v line=y] -f mktable.awk -
#
# x: bare (default), latex
# y: 1 or 2
#
# example: gawk -v mode=latex -v line=2 -f mktable.awk your_file
#
# This makes LaTeX source for a box with double lines
#
# Input data
# ----------
# Supply a file with the following format; items in square brackets are optional
#
# [/* format-specifier */]
# start_delim field_data [delim field_data]* end_delim
#
# format-specifier:
#     | printf_specifier
#     | format_specifier ',' printf_specifier
#     ;
#
# printf_specifier  A format specifier as you would supply it to printf. If a
#                   certain column has no printf_specifier, that column will be
#                   left-aligned and as wide as the widest element. If a column
#                   only has numeric (or empty) data, the column is right-
#                   aligned on the '.'
#
# field_data:       Anything. When there is more field_data than
#                   printf_specifiers, the last ones go unformatted. No problem
#                   if there are more printf_specifiers than field_data. There
#                   may be no '|' in the field_data.
#
# start_delim:      Start delimiter of old formatting
# delim:            Delimiter in old formatting
# end_delim:        End delimiter of old formatting
#
# Limitations
# -----------
# It stores the table data in memory so you must have enough of that

# Set up the per-mode delimiter tables and the column-type constants.
#
# For every mode there is a "search" entry (a regular expression used to
# recognise and strip formatting that is already present in the input) and a
# matching "_out" entry (the literal text written when the table is printed).
# Variables read from the command line (-v): mode, line.
BEGIN {
    if (mode == "") {
        mode = "bare"
    }

    first_delimiter["bare"]     = ""    # table row start, search expr
    first_delimiter_out["bare"] = ""    # table row start, output string
    delimiter["bare"]           = " "   # fieldsep, search expr
    delimiter_out["bare"]       = " "   # fieldsep, output string
    end_delimiter["bare"]       = ""    # table row end, search expr
    end_delimiter_out["bare"]   = ""    # table row end, output string

    first_delimiter["latex"]     = ""
    first_delimiter_out["latex"] = ""
    delimiter["latex"]           = "[ \t]*&[ \t]*"
    delimiter_out["latex"]       = " & "
    end_delimiter["latex"]       = " \\\\\\\\"
    end_delimiter_out["latex"]   = " \\\\"

    # One " \hline" is appended to the row terminator per requested line,
    # both in the search expression and in the output string.
    for (i = 0; i < line; i++) {
        end_delimiter["latex"]     = end_delimiter["latex"] " \\\\hline"
        end_delimiter_out["latex"] = end_delimiter_out["latex"] " \\hline"
    }

    # An unknown mode has no table entries: every delimiter would be empty
    # and the columns would be glued together. Fail loudly instead.
    if (!(mode in delimiter)) {
        print "mktable: unknown mode \"" mode "\" (expected bare or latex)" > "/dev/stderr"
        exit 1
    }

    oldFS = " "

    TYPE_UNKNOWN = 0
    NON_NUMERIC  = 1
    NUMERIC      = 2
}

# If the line number is 1, scan the optional formatters: a first line of the
# form "/* spec, spec, ... */" is split on ',' into printf_specifiers[] and
# is not treated as table data.
NR == 1 {
    if (($1 ~ /\/\*/) && ($NF ~ /\*\//)) {
        s = substr($0, 3)                   # drop the leading "/*"
        s = substr(s, 1, length(s) - 2)     # drop the trailing "*/"
        num_of_fields = split(s, printf_specifiers, ",")
        # Diagnostics go to stderr so they never end up inside the table.
        print "(debug) Read format" > "/dev/stderr"
        next
    }
}

# Skip optional tabular line in latex mode
$0 ~ /\\begin{tabular}/ && mode == "latex" { next }
$0 ~ /\\end{tabular}/ && mode == "latex" { next }

# Remove delimiters from previous formatting. This feature is not tested very
# well. Search expressions that are empty or blank are skipped.
{
    if (end_delimiter[mode] !~ /^[ \t]*$/)
        gsub(end_delimiter[mode], "")
    if (first_delimiter[mode] !~ /^[ \t]*$/)
        gsub(first_delimiter[mode], "")
}

# Remove empty lines
$0 ~ /^[ \t]*$/ { next }

# Check whether this table has spaces or delimiter[mode] as field separator.
# Intermixed is not allowed. Adjust FS to this for this line.
{
    # Pick the separator that fits this record and force awk to re-split it.
    FS = determine_FS($0)
    $0 = $0
}

# Keep track of the longest record found
{
    if (NF > max_num_of_fields) {
        max_num_of_fields = NF
    }
}

# Determine column width and numeric status of columns without
# printf_specifier and store this row
{
    for (i = num_of_fields + 1; i <= NF; i++) {
        # Determine length for any column
        if (length($i) > max_field_len[i]) {
            max_field_len[i] = length($i)
        }

        # Determine numeric status and the widths on either side of the '.'.
        # Empty fields are neutral; one non-numeric field makes the whole
        # column non-numeric for good.
        if (($i != "") && (isnumeric[i] != NON_NUMERIC)) {
            if (!is_numeric_text($i)) {
                isnumeric[i] = NON_NUMERIC
            } else {
                # is_numeric_text() guarantees at most one '.', so split()
                # yields one or two parts; temp[2] is "" when there is none.
                split($i, temp, ".")
                if (length(temp[1]) > numeric_width[i, "before"]) {
                    numeric_width[i, "before"] = length(temp[1])
                }
                if (length(temp[2]) > numeric_width[i, "after"]) {
                    numeric_width[i, "after"] = length(temp[2])
                }
                isnumeric[i] = NUMERIC
            }
        }
    }
    row[++row_count] = $0
}

# Now print the table
END {
    # Print header first, the header may be of type "latex" or "bare"
    if (mode == "latex") {
        printf("\\begin{tabular}{")
        if (line >= 1) printf("|")
        if (line == 2) printf("|")
        # isnumeric[] is indexed by field number, i.e. starting at 1.
        for (i = 1; i <= max_num_of_fields; i++) {
            printf("%s", (isnumeric[i] == NUMERIC) ? "r" : "l")
            if (line >= 1) printf("|")
            if (line == 2) printf("|")
        }
        print "}"
    }

    # Print body. Delimiters and padding are data, not format strings, so
    # they are always emitted through "%s".
    for (i = 1; i <= row_count; i++) {
        field_count = split(row[i], fields, determine_FS(row[i]))
        printf("%s", first_delimiter_out[mode])
        for (field = 1; field <= max_num_of_fields; field++) {
            if (field > field_count) {
                # This line has less fields than the longest line
                printf("%s", replicate(" ", column_width(field)))
            } else if (printf_specifiers[field] != "") {
                # This is a column with specifier
                printf(printf_specifiers[field], fields[field])
            } else if (isnumeric[field] == NUMERIC) {
                # Numeric column: pad left of the integer part and right of
                # the fraction so that the '.' of every row lines up.
                n = split(fields[field], temp, ".")
                if (n == 2) {
                    point = "."
                } else {
                    point = (numeric_width[field, "after"] == 0) ? "" : " "
                }
                printf("%s%s%s%s%s",
                       replicate(" ", numeric_width[field, "before"] - length(temp[1])),
                       temp[1],
                       point,
                       temp[2],
                       replicate(" ", numeric_width[field, "after"] - length(temp[2])))
            } else {
                # Non-numeric column: left-aligned, padded to column width
                printf("%s%s",
                       substr(fields[field], 1, max_field_len[field]),
                       replicate(" ", max_field_len[field] - length(fields[field])))
            }
            if (field < max_num_of_fields) {
                printf("%s", delimiter_out[mode])
            }
        }
        printf("%s\n", end_delimiter_out[mode])
    }

    # Print footer
    if (mode == "latex") {
        print "\\end{tabular}"
    }
}

# Return s repeated n times; n is truncated to an integer and anything
# below 1 yields the empty string. r is a local.
function replicate(s, n,    r) {
    r = ""
    for (n = int(n); n > 0; n--) {
        r = r s
    }
    return r
}

# Return s with leading and trailing spaces and tabs removed.
function alltrim(s) {
    sub(/^[ \t]*/, "", s)
    sub(/[ \t]*$/, "", s)
    return s
}

# Return the field separator to use for record s: delimiter[mode] when that
# expression is non-blank and occurs in s, otherwise the global oldFS.
function determine_FS(s) {
    if (delimiter[mode] !~ /^[ \t]*$/ && s ~ delimiter[mode]) {
        return delimiter[mode]
    }
    return oldFS
}

# Return 1 when s, ignoring surrounding blanks, is a plain number: optional
# sign, digits (commas allowed as grouping) and at most one '.' followed by
# digits; return 0 otherwise. The match is anchored so that text which merely
# contains a digit is not taken for a number.
function is_numeric_text(s) {
    s = alltrim(s)
    if (s ~ /^[-+]?[0-9,]*[0-9][0-9,]*(\.[0-9]+)?$/) {
        return 1
    }
    return (s ~ /^[-+]?\.[0-9]+$/) ? 1 : 0
}

# Return the printed width of column col when it has no printf specifier:
# the aligned numeric width for numeric columns, the longest field otherwise.
function column_width(col,    after) {
    if (isnumeric[col] == NUMERIC) {
        after = numeric_width[col, "after"] + 0
        return numeric_width[col, "before"] + after + (after > 0 ? 1 : 0)
    }
    return max_field_len[col] + 0
}