#!/bin/sh

# Copyright 2013 MediaMobil Communication GmbH
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# This script converts a binary .db file into a .csv file.
# The .db file was generated by darkstat with the --export option.
# The resulting .csv file uses semicolons as field separators and can be
# opened with any spreadsheet application.
# Name of this script without its directory, used as prefix of all messages.
SCRIPTNAME=${0##*/}

# Make sure the external tools of the conversion pipeline can be found.
# "command -v" is the POSIX way to look up a command; "type -P" is a bash
# extension that a plain /bin/sh does not understand.
if ! command -v awk >/dev/null 2>&1; then
  printf '%s\n' "${SCRIPTNAME}: missing AWK interpreter, at least not found in PATH" >&2
  printf '%s\n' "${SCRIPTNAME}: every POSIX compliant OS has one; add the location to PATH" >&2
  exit 1
fi
if ! command -v od >/dev/null 2>&1; then
  printf '%s\n' "${SCRIPTNAME}: missing od file dump tool, at least not found in PATH" >&2
  printf '%s\n' "${SCRIPTNAME}: every POSIX compliant OS has one; add the location to PATH" >&2
  exit 1
fi

# Exactly one argument is expected: the name of the .db file.
if test $# -ne 1; then
  printf '%s\n' "${SCRIPTNAME}: missing parameter; need file name of .db file" >&2
  exit 1
fi
DBFILENAME=$1
# The quotes matter: an unquoted empty name would turn this into "test -r",
# which is always true.
if test -r "${DBFILENAME}"; then
  printf '%s\n' "${SCRIPTNAME}: Found file ${DBFILENAME}"
else
  printf '%s\n' "${SCRIPTNAME}: file ${DBFILENAME} does not exist" >&2
  exit 1
fi

# Derive the output name by replacing the last extension of the file name.
# Only the last path component is inspected, so dots in directory names
# (e.g. "./export/stats.db") are left alone.  A name without extension
# simply gets ".csv" appended.
case "${DBFILENAME##*/}" in
  ?*.*) CSVFILENAME="${DBFILENAME%.*}.csv" ;;
  *)    CSVFILENAME="${DBFILENAME}.csv" ;;
esac
printf '%s\n' "${SCRIPTNAME}: Writing output into ${CSVFILENAME}"

# The spec of the .db export format exists for different versions:
#   https://github.com/emikulic/darkstat/blob/master/export-format.txt
#   http://git.msquadrat.de/darkstat.git/blob_plain/master:/export-format.txt
#   http://phil.lavin.me.uk/downloads/parse.phps
# Only file format version 1 is supported by us.
# Obviously, darkstat itself distinguishes 3 different host format versions.
# Only host format version 2 is supported by us.
# The darkstat database file is converted from binary format
# to ASCII by the standard Unix command od.

# Some things don't work correctly yet.
# Probably because there is no DNS server configured in our embedded device
# that produces .db files within OpenWRT.
#   - host name contains nonsense at constant length 5
#   - "last seen" timing information contains always 0
#   - we read the graphics section of the file but ignore it

# Let the od tool convert each binary byte into several textual formats.
# With -w1 and three -t types, od writes three lines per input byte:
#   "<decimal offset> <hex value>", "<unsigned decimal value>", "<ASCII name>"
# After the last byte od writes one more line that holds only the end offset.
# The AWK script reads all variants and later picks the format it needs.
od  -Ad -v -tx1 -tu1 -ta -w1 < "${DBFILENAME}" |
awk '
  NF==2                      { addr = 0 + $1; hex[addr] = $2; next }
  NF==1 && ! (addr in dec)   { dec[addr]   = $1;              next }
  NF==1 && ! (addr in ascii) { ascii[addr] = $1;              next }
  # A further one-field line is the trailing end offset of od. It matches
  # none of the rules above and is dropped, so that it cannot overwrite
  # the ASCII name stored for the last byte.
  # Now all variants of the bytes are available in certain arrays.
  # The array indices cover the range 0 .. addr.

  # Concatenate count consecutive elements of array, beginning at index
  # address, and return them as one string.
  # The result is empty when count is smaller than 1.
  function read_bytes(array, address, count,    joined, stop) {
    joined = ""
    stop = address + count
    while (address < stop)
      joined = joined array[address++]
    return joined
  }
  # Combine count consecutive bytes, beginning at index address, into one
  # unsigned number. The first byte is the most significant one; the byte
  # values are taken from the global array dec.
  function read_number(address, count,    value, stop) {
    value = 0
    for (stop = address + count; address < stop; address++)
      value = value * 256 + dec[address]
    return value
  }
  # Return the od character names (global array ascii) of count
  # consecutive bytes, beginning at index address, joined into one string.
  function read_text(address, count) {
    return read_bytes(ascii, address, count)
  }
  # Report a problem and optionally abort the conversion.
  #   reason    - message text; nothing is printed when it is empty
  #   terminate - when not 0: dump all bytes from the read position ai up
  #               to the last byte addr and leave awk
  #   retval    - exit status used when terminating
  # The message and the dump are written to standard error. Standard
  # output is the generated .csv file and must contain CSV data only.
  function quit(reason, terminate, retval,    i, stderr) {
    # Portable way to reach standard error from every awk implementation.
    stderr = "cat 1>&2"
    if (length(reason) > 0)
      print reason | stderr
    if (terminate != 0) {
      # Any remaining bytes in the file shall be dumped.
      for (i=ai; i<=addr; i++)
        print i, hex[i], ascii[i] | stderr
    }
    # Closing the pipe flushes the diagnostics before awk goes on or exits.
    close(stderr)
    if (terminate != 0)
      exit(retval)
  }
  # Parse the IP protocol table of the current host at read position ai.
  # Layout: marker "P", 1 byte entry count, then per entry 1 byte protocol
  # number, 8 bytes received and 8 bytes sent.
  # Appends every protocol number to IPprotos, adds the byte counters to
  # ip_proto_in and ip_proto_out, and moves ai behind the table.
  function readIPsection() {
    ip_protos_data = read_bytes(ascii, ai, 1)
    if (ip_protos_data != "P") 
      quit("expected ip_protos_data P, found " ip_protos_data, 1, 1)
    ip_proto_count = read_number(ai + 1, 1)
    # Skip the marker and the count.
    ai += 2
    for (pi = 1; pi <= ip_proto_count; pi++) {
      ip_proto_type = read_number(ai, 1)
      IPprotos      = IPprotos " " ip_proto_type
      ip_proto_in  += read_number(ai + 1, 8)
      ip_proto_out += read_number(ai + 9, 8)
      # One entry occupies 1 + 8 + 8 bytes.
      ai += 17
    }
  }
  # Parse the TCP port table of the current host at read position ai.
  # Layout: marker "T", 2 bytes entry count, then per entry 2 bytes port
  # number, 8 bytes SYN counter, 8 bytes received and 8 bytes sent.
  # Updates the globals tcp_proto_count, TCPports, tcp_proto_in and
  # tcp_proto_out; traffic of port 22 is added to ssh_in/ssh_out and
  # traffic of port 3389 to rdp_in/rdp_out. Moves ai behind the table.
  # Calls quit when the marker is missing.
  function readTCPsection(    ti, port, port_in, port_out) {
    tcp_protos_data=read_bytes(ascii, ai, 1)
    if (tcp_protos_data != "T") 
      quit("expected tcp_protos_data T, found " tcp_protos_data, 1, 1)
    ai += 1
    tcp_proto_count=read_number(ai, 2)
    ai += 2
    for (ti=0; ti<tcp_proto_count; ti++) {
      port=read_number(ai, 2)
      ai += 2
      TCPports = TCPports " " port
      # The SYN counter is not exported into the .csv file; skip it.
      ai += 8
      port_in=read_number(ai, 8)
      ai += 8
      port_out=read_number(ai, 8)
      ai += 8
      tcp_proto_in  += port_in
      tcp_proto_out += port_out
      # Count only the bytes of this very port for ssh and rdp, not the
      # running total over all ports parsed so far.
      if (port == 22) {
        ssh_in  += port_in
        ssh_out += port_out
      }
      if (port == 3389) {
        rdp_in  += port_in
        rdp_out += port_out
      }
    }
  }
  # Parse the UDP port table of the current host at read position ai.
  # Layout: marker "U", 2 bytes entry count, then per entry 2 bytes port
  # number, 8 bytes received and 8 bytes sent.
  # Updates the globals udp_proto_count, UDPports, udp_proto_in and
  # udp_proto_out; traffic of port 22 is added to ssh_in/ssh_out and
  # traffic of port 3389 to rdp_in/rdp_out. Moves ai behind the table.
  # Calls quit when the marker is missing.
  function readUDPsection(    ui, port, port_in, port_out) {
    udp_protos_data=read_bytes(ascii, ai, 1)
    if (udp_protos_data != "U") 
      quit("expected udp_protos_data U, found " udp_protos_data, 1, 1)
    ai += 1
    udp_proto_count=read_number(ai, 2)
    ai += 2
    for (ui=0; ui<udp_proto_count; ui++) {
      port=read_number(ai, 2)
      ai += 2
      UDPports = UDPports " " port
      port_in=read_number(ai, 8)
      ai += 8
      port_out=read_number(ai, 8)
      ai += 8
      udp_proto_in  += port_in
      udp_proto_out += port_out
      # Count only the bytes of this very port for ssh and rdp, not the
      # running total over all ports parsed so far.
      if (port == 22) {
        ssh_in  += port_in
        ssh_out += port_out
      }
      if (port == 3389) {
        rdp_in  += port_in
        rdp_out += port_out
      }
    }
  }
  # Read one graph of the graph section at read position ai and move ai
  # behind it. A graph consists of 1 byte bar count, 1 byte bar index and
  # 16 bytes (in and out counter) per bar.
  # The values are parsed but not used; the parameter interval only names
  # the graph for the reader of the calling code.
  function readGraphsection(interval) {
    n_bars = read_number(ai, 1)
    i_bars = read_number(ai + 1, 1)
    ai += 2
    for (bi = n_bars; bi > 0; bi--) {
      graph_bytes_in  = read_number(ai, 8)
      graph_bytes_out = read_number(ai + 8, 8)
      ai += 16
    }
  }

  # Runs after all od output has been stored in hex[], dec[] and ascii[].
  # Walks through the file image, writes the .csv header and one .csv line
  # per host, and finally checks that the graph section ends the file.
  END {
    file_header=read_bytes(hex, 0, 4)
    if (file_header != "da314159")
      quit("input data is not an exported darkstat .db file, wrong header: " file_header, 1, 1)
    section_header=read_bytes(hex, 4, 3)
    if (section_header != "da4853")
      quit("section header da4853 expected: " section_header, 1, 1)
    db_version=read_bytes(hex, 7, 1)
    if (db_version != "01")
      quit("file format supported only in version 01", 1, 1)
    host_count=read_number(8, 4)
    # ai is the index of the next byte to be parsed.
    ai=12
    # Print a header into the .csv file.
    printf("IP address;MAC address;host in bytes;host out bytes;IP protos;IP in bytes;IP out bytes;TCP port count;TCP in bytes;TCP out bytes;UDP port count;UDP in bytes;UDP out bytes;ssh in bytes;ssh out bytes;rdp in bytes;rdp out bytes;TCP ports;UDP ports\n")
    for (hi=1; hi<=host_count; hi++) {
      # Make sure all variables to be printed are initially empty.
      ip_address=mac_address=""
      host_bytes_in=host_bytes_out=ip_proto_in=ip_proto_out=tcp_proto_in=tcp_proto_out=udp_proto_in=udp_proto_out=ssh_in=ssh_out=rdp_in=rdp_out=0
      IPprotos=TCPports=UDPports=""
      tcp_proto_count=udp_proto_count=0
      # Every host record starts with the characters H, S, T (hex 48 53 54)
      # followed by the host format version.
      host_header=read_bytes(hex, ai, 3)
      if (host_header != "485354")
        quit("host header 485354 expected: " host_header, 1, 1)
      host_version=read_bytes(hex, ai+3, 1)
      ai += 4
      if (host_version != "02")
        quit("host format supported only in version 02: " host_version, 1, 1)
      # IPv4 address, 4 bytes.
      ip_address=read_number(ai+0,1) "." read_number(ai+1,1) "." read_number(ai+2,1) "." read_number(ai+3,1)
      ai += 4
      # Last seen time stamp. In host format version 2 this is a 64 bit
      # value; reading only 4 bytes here shifts the MAC address and the
      # host name that follow by 4 bytes.
      last_seen=read_number(ai, 8)
      ai += 8
      # MAC address, 6 bytes.
      mac_address=hex[ai+0] ":" hex[ai+1] ":" hex[ai+2] ":" hex[ai+3] ":" hex[ai+4] ":" hex[ai+5]
      ai += 6
      # Host name: 1 byte length followed by that many characters.
      # The name is not part of the .csv columns, so it is skipped.
      host_name_length=read_number(ai, 1)
      ai += 1 + host_name_length
      host_bytes_in=read_number(ai, 8)
      ai += 8
      host_bytes_out=read_number(ai, 8)
      ai += 8
      readIPsection()
      readTCPsection()
      readUDPsection()
      # Byte counters are 64 bit values. They are printed with %.0f since
      # some awk implementations limit %d to the range of a C int.
      printf("\"%s\";\"%s\";%.0f;%.0f;%s;%.0f;%.0f;%d;%.0f;%.0f;%d;%.0f;%.0f;%.0f;%.0f;%.0f;%.0f;%s;%s\n",
              ip_address, mac_address, host_bytes_in, host_bytes_out,
              IPprotos, ip_proto_in, ip_proto_out,
              tcp_proto_count, tcp_proto_in, tcp_proto_out,
              udp_proto_count, udp_proto_in, udp_proto_out,
              ssh_in, ssh_out, rdp_in, rdp_out,
              TCPports, UDPports)
    }
    section_header=read_bytes(hex, ai, 3)
    if (section_header != "da4752")
      quit("section header da4752 expected: " section_header, 1, 1)
    ai += 3
    db_version=read_bytes(hex, ai, 1)
    if (db_version != "01")
      quit("file format supported only in version 01", 1, 1)
    ai += 1
    last_time=read_number(ai, 8)
    ai += 8
    readGraphsection("60 seconds")
    readGraphsection("60 minutes")
    readGraphsection("24 hours")
    readGraphsection("31 days")
    # The complete file has been parsed, no bytes should be left over.
    # addr is the index of the last byte and ai the index of the next
    # unread byte, so a completely consumed file ends with ai == addr+1.
    if (ai > addr + 1)
      quit("input data is truncated, missing bytes: " (ai - addr - 1), 1, 1)
    if (ai <= addr)
      quit("unexpected data behind the graph section", 1, 1)
    # Falling off the end of END terminates awk with return value 0.
  }
  ' > "${CSVFILENAME}"
