#
# an example of R for CMSC 331, Fall 2011
# Charles Nicholas, CSEE Dept., UMBC
#

# 
# In R, use the command "source('myhash.r')" to load this file
# but use at your own risk!
#

# In recent months I've grown interested in "fuzzy" file comparisons
#   based on the normalized compressed distance, or NCD.  As a result,
#   I've had to run some experiments where I simulate the operation of
#   a compression scheme called LZW compression.  That algorithm, in
#   turn, uses hashing in a big way.  I could just borrow some GPL
#   code, but I wanted to learn more about R, so I used that language
#   instead.

# we implement hash tables for keys where each key is a sequence of
# one or more integers in the range 0-255.  Easily equivalent to raw vectors


newHash <- function(size=1097) {  # 1097 happens to be prime
# make a new hash table
  nEntries = 0
  chainLengths = rep(as.integer(0),times=size)
  table = rep(NULL, times=size)
  return(list(size=size, nEntries=nEntries, chainLengths=chainLengths,
  	 table=table))
}

insert <- function(aHash, key, val) {
  # key may be a sequence of integers
  print(paste("insert: key is", key))
  print(paste("insert: val is", val))
#  print(paste("insert: size is", aHash$size))
  h = hash(key,m=aHash$size)
#  print(paste("insert: hash is", h))
  aPair = list(key=key,val=val)
  chainLen = aHash$chainLengths[h]
  print(sprintf("insert: chainLen is %d", aHash$chainLengths[h]))
  if (chainLen == 0) {   # list is empty, insert the first element 
    aHash$table[h] = list(aPair)
    print(paste("insert: chain is ",aHash$table[h]))
  } else {
    # the list is not empty, so see if we're adding or replacing
    existsAt = 0
    chain=aHash$table[h]
    vapply(1:chainLen, function(i) {
        if (chain[i]$key == key) {
          existsAt <<- i
        }
      }
    )
    if (existsAt > 0) {
      aHash$table[h][[existsAt]] = aPair
    } else { 
      aHash$table[h] = c(aHash$table[h],aPair)
    }
  } # match else
  print(paste("added hash key", key, "at position ",h," in hash table"))
  aHash$chainLengths[h] = aHash$chainLengths[h] + 1
  aHash$nEntries = aHash$nEntries+1
  return(aHash)
}

printHash <- function(aHash) {
  # print all the hash keys and associated values
  print(sprintf("hash table of size:  %d",aHash$size))
  print(sprintf("number of entries:  %d",aHash$nEntries))
  chainLens = aHash$chainLengths
  result = lapply(1:aHash$size, function(i) {
      chain = aHash$table[i]
      chainLen = aHash$chainLengths[i]
      if (chainLen != 0) {
        print(sprintf("chain[%d] is of length %d", i, chainLen))
        lapply(1:chainLen, function(k) {
            listEntry = chain[[k]]
            print(paste("key:",listEntry$key,"value:",listEntry$val))
          }
        )
      }
    }
  )
}

getHashKeys <- function(aHash) {
  # return a list of all the hash keys
  print(sprintf("number of entries:  %d",aHash$nEntries))
  chainLens = aHash$chainLengths
  keyList = NULL
  result = lapply(1:aHash$size, function(i) {
      chain = aHash$table[i]
      chainLen = aHash$chainLengths[i]
      if (chainLen != 0) {
        lapply(1:chainLen, function(k) {
            listEntry = chain[[k]]   # double brackets since it's a list
	    if (length(keyList) > 0) { 
              keyList <<- c(keyList,listEntry$key)
	    } else {
	      keyList <<- list(listEntry$key)
	    }
          }
        )
      }
    }
  )
  return(keyList)
}

hash <- function(k, m) {
  # input  key is a vector of integers each in the range 0:255
  #        m is the hash table size, i.e. number of buckets
  # output an integer hash key

  # a variant on Knuth Variant on Division from some web site
  # if k is a single integer, i.e. a list of length 1,

  # I suspect this is a really lousy hash function...why would I say
  #  that?
 
#  print(paste("hash: k is ",k))
#  print(paste("hash: m is ",m))
  if (length(k) == 1) {
    h = (k[1]*(k[1]+3))%%m
  } else {  # it is a vector with 2 or more entries, so hash each 
    hk = vapply(k, function(i) {
        h = k[i]*(k[i]+3)%%m
      }, k
    )
    # then hash the sum
    k=sum(hk)
    h = k*(k+3)%%m
  }  
}

get <- function(aHash, key, val) {
  # key may be a sequence of integers
  h = hash(key,m=aHash$size)
  chain = aHash$table[h]
  chainLens = aHash$chainLengths
  found = FALSE
  if (chainLens[h] == 0) {
    val = -1  # no val associated with this key was found
  } else { 
    lapply(1:chainLens[h], function(i) {
        if (chain[[i]]$key == key) {
          found <<- TRUE
          val <<- chain[[i]]$val
        }
      }
    )
  }
  return(list(found=found,val=val))
}


test <- function() {
  # make a very small hash table for testing
  aHash = newHash(size=128)
  print("testing insert")
  aHash = insert(aHash, 162, "cat")	
#  printHash(aHash)
  aHash = insert(aHash, 2, "moose")	
  print("The hash keys so far include:")
  keyList = getHashKeys(aHash)
  print(sprintf("we have seen %d keys so far", length(keyList)))
  lapply(keyList, function(k) print(k))
  print("testing get")
  result = get(aHash, 162)
  if (result$found) {
    print(result$val)
  } else {
    print("not found")
  }
  result = get(aHash, 22)
  if (result$found) {
    print(result$val)
  } else {
    print("not found")
  }
#  printHash(aHash)
}