# # an example of R for CMSC 331, Fall 2011 # Charles Nicholas, CSEE Dept., UMBC # # # In R, use the command "source('myhash.r')" to load this file # but use at your own risk! # # In recent months I've grown interested in "fuzzy" file comparisons # based on the normalized compressed distance, or NCD. As a result, # I've had to run some experiments where I simulate the operation of # a compression scheme called LZW compression. That algorithm, in # turn, uses hashing in a big way. I could just borrow some GPL # code, but I wanted to learn more about R, so I used that language # instead. # we implement hash tables for keys where each key is a sequence of # one or more integers in the range 0-255. Easily equivalent to raw vectors newHash <- function(size=1097) { # 1097 happens to be prime # make a new hash table nEntries = 0 chainLengths = rep(as.integer(0),times=size) table = rep(NULL, times=size) return(list(size=size, nEntries=nEntries, chainLengths=chainLengths, table=table)) } insert <- function(aHash, key, val) { # key may be a sequence of integers print(paste("insert: key is", key)) print(paste("insert: val is", val)) # print(paste("insert: size is", aHash$size)) h = hash(key,m=aHash$size) # print(paste("insert: hash is", h)) aPair = list(key=key,val=val) chainLen = aHash$chainLengths[h] print(sprintf("insert: chainLen is %d", aHash$chainLengths[h])) if (chainLen == 0) { # list is empty, insert the first element aHash$table[h] = list(aPair) print(paste("insert: chain is ",aHash$table[h])) } else { # the list is not empty, so see if we're adding or replacing existsAt = 0 chain=aHash$table[h] vapply(1:chainLen, function(i) { if (chain[i]$key == key) { existsAt <<- i } } ) if (existsAt > 0) { aHash$table[h][[existsAt]] = aPair } else { aHash$table[h] = c(aHash$table[h],aPair) } } # match else print(paste("added hash key", key, "at position ",h," in hash table")) aHash$chainLengths[h] = aHash$chainLengths[h] + 1 aHash$nEntries = aHash$nEntries+1 return(aHash) } printHash <- function(aHash) { # print all the hash keys and associated values print(sprintf("hash table of size: %d",aHash$size)) print(sprintf("number of entries: %d",aHash$nEntries)) chainLens = aHash$chainLengths result = lapply(1:aHash$size, function(i) { chain = aHash$table[i] chainLen = aHash$chainLengths[i] if (chainLen != 0) { print(sprintf("chain[%d] is of length %d", i, chainLen)) lapply(1:chainLen, function(k) { listEntry = chain[[k]] print(paste("key:",listEntry$key,"value:",listEntry$val)) } ) } } ) } getHashKeys <- function(aHash) { # return a list of all the hash keys print(sprintf("number of entries: %d",aHash$nEntries)) chainLens = aHash$chainLengths keyList = NULL result = lapply(1:aHash$size, function(i) { chain = aHash$table[i] chainLen = aHash$chainLengths[i] if (chainLen != 0) { lapply(1:chainLen, function(k) { listEntry = chain[[k]] # double brackets since it's a list if (length(keyList) > 0) { keyList <<- c(keyList,listEntry$key) } else { keyList <<- list(listEntry$key) } } ) } } ) return(keyList) } hash <- function(k, m) { # input key is a vector of integers each in the range 0:255 # m is the hash table size, i.e. number of buckets # output an integer hash key # a variant on Knuth Variant on Division from some web site # if k is a single integer, i.e. a list of length 1, # I suspect this is a really lousy hash function...why would I say # that? # print(paste("hash: k is ",k)) # print(paste("hash: m is ",m)) if (length(k) == 1) { h = (k[1]*(k[1]+3))%%m } else { # it is a vector with 2 or more entries, so hash each hk = vapply(k, function(i) { h = k[i]*(k[i]+3)%%m }, k ) # then hash the sum k=sum(hk) h = k*(k+3)%%m } } get <- function(aHash, key, val) { # key may be a sequence of integers h = hash(key,m=aHash$size) chain = aHash$table[h] chainLens = aHash$chainLengths found = FALSE if (chainLens[h] == 0) { val = -1 # no val associated with this key was found } else { lapply(1:chainLens[h], function(i) { if (chain[[i]]$key == key) { found <<- TRUE val <<- chain[[i]]$val } } ) } return(list(found=found,val=val)) } test <- function() { # make a very small hash table for testing aHash = newHash(size=128) print("testing insert") aHash = insert(aHash, 162, "cat") # printHash(aHash) aHash = insert(aHash, 2, "moose") print("The hash keys so far include:") keyList = getHashKeys(aHash) print(sprintf("we have seen %d keys so far", length(keyList))) lapply(keyList, function(k) print(k)) print("testing get") result = get(aHash, 162) if (result$found) { print(result$val) } else { print("not found") } result = get(aHash, 22) if (result$found) { print(result$val) } else { print("not found") } # printHash(aHash) }