#
# should work for VX Heaven as well as exploit kit Javascript
# as well as Greek NT and Shakespeare, with the appropriate first pass
#
# to build TDM, run on dream-cs, e.g.
# python3 vx5.py -b -c ws -d WSbyAct -q
# to make graphs showing specimens, run it anywhere, e.g.
# python3 vx5.py -v -c vx -t "Title for Plot"
#

import argparse
import datetime
import getopt
import io
import math
import os
import pickle
import re
import scipy
import sys
import time

# probably won't use both
#import matplotlib.pyplot as plt
# pip install plotly needs to be done somewhere
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

import numpy as np
import vxlib as vx

#from mpl_toolkits.mplot3d import Axes3D
#from mpl_toolkits.mplot3d import proj3d

from numpy.linalg import matrix_rank

import scipy
import scipy.sparse

from scipy.sparse.linalg import svds
from scipy import linalg
from sys import stdout

def k2idx4(k):
    """Pack a 4-character n-gram into one integer index.

    The code point of each character is used as a base-256 digit, most
    significant first, so "abcd" becomes 0x61626364.
    """
    # One-element slices are used instead of k[pos] because ord() accepts a
    # length-1 slice of either a str or a bytes object.
    packed = 0
    for pos in range(4):
        packed = packed * 256 + ord(k[pos:pos + 1])
    return packed

def idx2k4(idx):
    """Turn an integer index back into the 4-character n-gram it encodes.

    The index is read as four base-256 digits; each digit becomes one
    character of the result, most significant digit first.
    """
    chars = []
    remaining = idx
    # Peel off the three low-order digits; what is left is the leading one.
    for _ in range(3):
        remaining, lowDigit = divmod(remaining, 256)
        chars.append(chr(lowDigit))
    chars.append(chr(remaining))
    return "".join(reversed(chars))

def k2idx3(k):
    """Pack a 3-character n-gram into one integer index.

    The three code points are combined as base-256 digits, most significant
    first, so "abc" becomes 0x616263.
    """
    # Length-1 slices keep ord() usable on both str and bytes input.
    hi, mid, lo = (ord(k[pos:pos + 1]) for pos in range(3))
    return (hi << 16) + (mid << 8) + lo

def idx2k3(idx):
    """Turn an integer index back into the 3-character n-gram it encodes.

    The index is split into three base-256 digits and each digit is mapped
    to the character with that code point, most significant digit first.
    """
    hi, lowTwo = divmod(idx, 65536)
    mid, lo = divmod(lowTwo, 256)
    return "".join(map(chr, (hi, mid, lo)))

# note directories are slash-terminated
# for testing
#aDir="./nurseryRhyme/"

# Shakespeare
#aDir="./WS/"

# exploit kit Javascript
#aDir="/media/data/parse_data/Javascript/_data/"  # on dream-lab.cs.umbc.edu
#logDir="/media/data/parse_data/out/"  # on dream-lab.cs.umbc.edu

# VX Heaven
#aDir="./smallest/"
#aDir="/media/data/vxheaven/smaller/"  # on dream-lab.cs.umbc.edu
#aDir="/Volumes/MyBook/vxheaven/small/"  # on office Mac
#aDir="/media/data/vxheaven/small/"  # on dream-lab.cs.umbc.edu
#aDir="/media/data/vxheaven/viruses-2010-05-18/"  # on dream-lab.cs.umbc.edu
#aDir="/Volumes/MyBook/vxheaven/viruses-2010-05-18/"  # on office Mac
#aDir="/home/charles/viruses-2010-05-18/"  # on office Mac


# CPU-time reading taken when the module is loaded; mesg() (and the build
# code) measure elapsed CPU time relative to this value.
t = time.process_time()


def mesg(aString):
    """Print aString prefixed with the CPU seconds used since module load."""
    elapsed = time.process_time() - t
    print("{:7.2f} {!s}".format(elapsed, aString))

def selectSomeFiles(aDir, matchText, sampleFactor, nickname):
    """Pick the corpus files to process and record the choice in a pickle.

    Parameters:
        aDir: directory holding the raw specimens, with or without a
            trailing slash.
        matchText: substring a file name must contain to be selected; the
            empty string selects every file.
        sampleFactor: keep every sampleFactor-th candidate (1 keeps all).
        nickname: corpus nickname, used to name the pickle file.

    Returns:
        Sorted list of the selected paths, each formed as aDir + file name.

    Side effect:
        Writes [selectedFiles] to <aDir>/<nickname>.selectedFiles.pickle.
    """
    # The listing is requested with the directory exactly as the caller
    # passed it.  Presumably the returned names are relative to aDir, since
    # they are prefixed with it below -- verify against vxlib.
    allFiles = vx.getFileList(dir=aDir)

    # Make the directory slash-terminated so that aDir + name is a usable
    # path.  (The previous test, aDir[:-1] == '/', looked at everything
    # *except* the last character, so a second slash was appended to
    # directories that already ended in one.)
    if not aDir.endswith('/'):
        aDir = aDir + '/'

    # Names containing any of these strings are never selected; this keeps
    # the pickle files written into aDir out of the corpus.
    excludeThese = [".pickle"]

    # '"" in name' is always true, so an empty matchText selects every file
    # that is not excluded.
    possibleFiles = [aFile for aFile in allFiles
                     if matchText in aFile
                     and not any(bStr in aFile for bStr in excludeThese)]

    # NOTE: sampling is applied in listing order, before sorting, so which
    # files survive depends on the order vx.getFileList returns them in.
    selectedFiles = [aDir + aFile
                     for idx, aFile in enumerate(possibleFiles)
                     if idx % sampleFactor == 0]
    mesg("Sorting list of files to be read")
    selectedFiles = sorted(selectedFiles)

    pFile = aDir + nickname + '.selectedFiles.pickle'
    mesg("Brief delay: writing to %s" % pFile)
    # 'with' guarantees the pickle is flushed and closed before returning.
    with open(pFile, 'wb') as fh:
        pickle.dump([selectedFiles], fh, pickle.HIGHEST_PROTOCOL)
    return selectedFiles
    
def buildTDMpass1( n, aDir, selectedFiles, pickleFile, nickname, quick):
    """First pass: count how many documents each possible n-gram occurs in.

    Every file in selectedFiles is read with vx.dictOfFile and each distinct
    n-gram it reports bumps one slot of a dense document-frequency array
    that has one slot per POSSIBLE n-gram (256**n slots).

    Parameters:
        n: n-gram length; only 3 and 4 are supported.
        aDir: unused here; kept for a uniform signature across the passes.
        selectedFiles: paths of the documents to read.
        pickleFile: unused here.
        nickname: corpus nickname; the counts are saved to
            <nickname>.dfs.pickle, the name main() reads with -skip.
        quick: when true, skip writing the (large) pickle.

    Returns:
        termDfs, a numpy int32 array of length 256**n with the document
        frequency of every n-gram (0 for n-grams never seen).
    """
    mesg("Using n=%d, nSelectedFiles=%d" % (n, len(selectedFiles)))

    assert (n == 4) or (n==3)   # limited range for n

    if n==4:
        k2idx = k2idx4
        idx2k = idx2k4
        assert idx2k(k2idx("abcd")) == "abcd"
        arraySize = 0x100000000
    else:
        k2idx = k2idx3
        idx2k = idx2k3
        assert idx2k(k2idx(str("abc"))) == "abc"
        arraySize = 0x1000000

    nSelectedFiles = len(selectedFiles)
    mesg("Will read %d files" % (nSelectedFiles))

    # termDfs is an integer array, one int for each POSSIBLE n-gram.
    # If that n-gram occurs in the corpus, the corresponding entry is
    # non-zero.
    mesg("Initializing array")
    termDfs = np.zeros(arraySize, dtype=np.int32)
    # Saturate at the largest value the array's dtype can hold.  (The
    # previous limit of 2**32 - 1 is beyond int32, so the guard could
    # never stop a wrap-around.)
    maxFreq = np.iinfo(termDfs.dtype).max

    # make sure termDfs is initialized to zeros
    assert termDfs[0] == 0 and termDfs[arraySize-1] == 0

    termCount = 0   # number of distinct n-grams seen so far
    docCount = 0    # number of documents read so far

    for aFile in selectedFiles:
        # presumably a mapping from n-gram to its count in this file; only
        # the keys are used here -- verify against vxlib
        docTfs = vx.dictOfFile(aFile,n)
        docCount += 1
        mesg("File %d/%d: read %7d %d-grams from %s" %
              (docCount, nSelectedFiles, len(docTfs.keys()), n, aFile))

        # A plain loop: building a throw-away list of None results for
        # millions of keys would only waste memory.
        for k in docTfs.keys():
            idx = k2idx(k)

            # is this a new term?
            if termDfs[idx] == 0:
                termCount += 1

            # for any term, new or not, increment document frequency,
            # unless it would cause an overflow
            if termDfs[idx] < maxFreq:
                termDfs[idx] += 1

    mesg("number of documents read: %d" % docCount)
    assert np.count_nonzero(termDfs) == termCount
    mesg("corpus has %d terms" % termCount)

    # Same name that main() loads when pass 1 is skipped.
    pFile = nickname+'.dfs.pickle'
    if quick:
        mesg("Omit writing of %s" % pFile)
    else:
        mesg("Writing to %s" % pFile)
        with open(pFile, 'wb') as fh:
            pickle.dump([selectedFiles, termDfs], fh,
                        pickle.HIGHEST_PROTOCOL)
    return termDfs

def selectSomeTerms( n, aDir, selectedFiles, termDfs, 
                     pickleFile, lowThreshold, highThreshold, nickname):
    """Choose the vocabulary from the document-frequency array.

    Terms that are too rare or too common are zeroed out of termDfs IN
    PLACE; every term that remains is given a consecutive column number.

    Parameters:
        n: n-gram length (4 selects the 4-gram decoder, anything else the
            3-gram decoder).
        aDir: directory the vocabulary pickle is written into, with or
            without a trailing slash.
        selectedFiles: the corpus files; only their number is used, to turn
            highThreshold into an absolute cutoff.
        termDfs: document-frequency array indexed by n-gram index; modified.
        pickleFile: unused here.
        lowThreshold: drop terms occurring in this many documents or fewer;
            0 disables the low cutoff.
        highThreshold: drop terms occurring in more than this percentage of
            the documents; 100 disables the high cutoff.
        nickname: corpus nickname, used to name the pickle file.

    Returns:
        dict mapping each kept n-gram string to its index, numbered in
        ascending order of the n-gram's integer index.

    Side effect:
        Writes [selectedTerms] to <aDir>/<nickname>.selectedTerms.pickle.
    """
    if n==4:
        idx2k = idx2k4
        assert idx2k(k2idx4("abcd")) == "abcd"
    else:
        idx2k = idx2k3
        assert idx2k(k2idx3(str("abc"))) == "abc"

    mesg("Using low cutoff of %d (0 means keep singletons)" % lowThreshold)
    nSelectedTerms = np.count_nonzero(termDfs)

    if lowThreshold > 0:
        mesg("Before removing infrequent terms: %10d" % nSelectedTerms)
        # A boolean mask selects the same entries as np.where would, without
        # materialising an index array with one 8-byte entry for almost
        # every slot (all the zero slots satisfy the comparison too).
        termDfs[termDfs <= lowThreshold] = 0
        nSelectedTerms = np.count_nonzero(termDfs)
        mesg("After removing infrequent terms:  %10d" % nSelectedTerms)

    # express threshold for too common terms as a percentage 
    # of the number of selected files
    highCutoff = len(selectedFiles) * highThreshold/100
    mesg("Using high cutoff of %d percent (max 100 means no high cutoff)" % 
         highThreshold)
    assert highThreshold <= 100
    if highThreshold < 100:
        termDfs[termDfs > highCutoff] = 0
        nSelectedTerms = np.count_nonzero(termDfs)
        mesg("Number of terms to be kept:    %10d" % nSelectedTerms)

    mesg("Adding %d terms to vocabulary" % nSelectedTerms)
    selectedTerms = {}
    for i, termIdx in enumerate(np.flatnonzero(termDfs)):
        if i % 100000 == 0:
            # progress counter, overwritten in place via the carriage return
            stdout.write("                                      %10d\r" % i)
        selectedTerms[idx2k(termIdx)] = i

    # Make the directory slash-terminated.  (The previous test,
    # aDir[:-1] == '/', ignored the last character and so appended a slash
    # even when one was already there.)
    if not aDir.endswith('/'):
        aDir = aDir + '/'

    pFile = aDir+nickname+'.selectedTerms.pickle'
    mesg("Brief delay: writing to %s" % pFile)
    with open(pFile, 'wb') as fh:
        pickle.dump([selectedTerms], fh, pickle.HIGHEST_PROTOCOL)
    return selectedTerms

def buildTDMpass2(n, aDir, selectedFiles, selectedTerms, 
                  normalize, printTDM, pickleFile, nickname):
    """Second pass: count the selected n-grams per file and pickle the TDM.

    The files are run through a scikit-learn CountVectorizer restricted to
    the fixed vocabulary selectedTerms.  The resulting matrix is transposed
    to one row per term and one column per document, converted to
    floating-point CSR, and written to pickleFile together with the term
    list and the file list.

    Parameters:
        n: n-gram length.
        aDir: data directory; only reported in the log.
        selectedFiles: paths of the documents, in column order.
        selectedTerms: dict mapping n-gram string -> column index.
        normalize: only announced in the log; NOTE no normalization is
            actually applied in this function.
        printTDM: when true, dump the raw matrix to stdout.
        pickleFile: output path for [tdm, termSet, selectedFiles].
        nickname: unused here.

    Returns:
        None; the result is delivered through pickleFile.
    """
    mesg("Entering buildTDMpass2, n is %d, aDir is %s " % (n, aDir))
    nSelectedFiles = len(selectedFiles)
    nSelectedTerms = len(selectedTerms)
    mesg("nSelectedFiles: %d" % nSelectedFiles)
    mesg("nSelectedTerms: %d" % nSelectedTerms)

    if normalize:
        mesg("TDM will be length normalized")
    if printTDM:
        mesg("TDM will be printed")

    # hook the read function so we can see what's happening
    # verbose files are file like objects, for our purpose, since
    # they have their own read function
    nFilesRead = 0

    class verboseFile(io.BufferedIOBase):
        def __init__(self, aFile):
            self.pathName = aFile

        def read(self):
            # Report progress, then hand back the whole file as bytes.
            nonlocal nFilesRead
            nFilesRead += 1
            print ("     File %d/%d, %s, of size %d" % 
                   (nFilesRead, nSelectedFiles,
                    os.path.basename(self.pathName), 
                    os.path.getsize(self.pathName)))
            with open(self.pathName, "rb") as fh:
                buffer = fh.read()
            return buffer

    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(
        input = 'file',          # send it list of file-like objects 
        analyzer = 'char',
        ngram_range=(n,n),
        lowercase=False,
        dtype=np.int64,          # some counts can get big
        vocabulary = selectedTerms,
        decode_error="ignore"
    )

    # Vectorize in batches so that only batchSize documents are processed
    # per call.  The vocabulary is fixed, so every batch has the same
    # columns and the pieces can simply be stacked.
    batchSize = 1000  # 10 for testing, 1000 or more for serious use
    batched = nSelectedFiles > batchSize
    parts = []
    for start in range(0, nSelectedFiles, batchSize):
        theseFiles = selectedFiles[start:start + batchSize]
        theFiles = [verboseFile(f) for f in theseFiles]
        parts.append(vectorizer.fit_transform(theFiles))
        if batched:
            mesg("added %d files to tdm" % len(theseFiles))

    if not parts:
        # no files at all: let the vectorizer decide what that means
        tdm = vectorizer.fit_transform([])
    elif len(parts) == 1:
        tdm = parts[0]
    else:
        # One stack at the end instead of re-stacking after every batch,
        # which copied the whole matrix built so far each time.
        tdm = scipy.sparse.vstack(parts)

    #
    # at this point, tdm has a ROW for each document,
    # and a COLUMN for each term
    #
    assert nSelectedFiles == tdm.shape[0]
    nTerms = tdm.shape[1]
    mesg("number of documents read: %d" % nSelectedFiles)
    mesg("number of terms observed: %d" % nTerms)

    # the following code assumes TDM has one row per term
    # and one column per document, so transpose
    mesg("Transposing the raw TDM")
    tdm = tdm.transpose()

    # tdm has the number of times term i occurs in document j
    # term i is the ith element of termSet
    # document j is the jth element of selectedFiles, which is sorted
    # Ordering the terms by their vocabulary index ties termSet[i] to row i
    # by construction, instead of relying on alphabetical order happening
    # to coincide with the index order.
    mesg("Sorting the term set")
    termSet = sorted(selectedTerms, key=selectedTerms.get)

    if printTDM:
        mesg("Print the raw TDM")
        for aRow in range(nTerms):
            print("Row %d (term \"%4s\"):\n" %
                  (aRow, termSet[aRow]), end='')
            for aCol in range(nSelectedFiles - 1):
                stdout.write("%3d, " % tdm[aRow, aCol])
            print("%3d" % tdm[aRow, nSelectedFiles-1])

    # what is the density of the matrix?
    # the sparse matrix package keeps count of the non-zero entries
    nCells = nTerms * nSelectedFiles
    density = tdm.getnnz() / nCells if nCells else 0.0   # empty corpus
    mesg("Density of raw TDM is %f (max is 1.0)" % density)

    # convert to csr, then to floating point for the later SVD
    mesg("Converting raw TDM from coo to csr format")
    tdm = scipy.sparse.csr_matrix(tdm)
    tdm = tdm.asfptype()

    # save data structures for use in use phase
    mesg("Writing to %s" % pickleFile)
    with open(pickleFile, 'wb') as fh:
        pickle.dump([tdm, termSet, selectedFiles], fh,
                    pickle.HIGHEST_PROTOCOL)
    mesg("Finished building TDM")
    
# Colour given to any specimen that no rule below matches.
_DEFAULT_COLOR = 'Linen'

# Per-corpus (substring of the file path, colour) rules; first match wins.
# for color names see
# http://www.w3schools.com/colors/colors_names.asp
_NAME_COLORS = {
    "vx": (('Rootkit', 'Brown'),
           ('KeyLogger', 'Green')),
    "ws": (('Verona', 'rgba(200,  50,  50, .9)'),
           ('Henry', 'Green')),
    # 'Scarlet' is not a CSS colour name, so 'Crimson' is used for mark.
    "NT": (('matt', 'Purple'),
           ('mark', 'Crimson'),
           ('luke', 'Green'),
           ('john', 'Blue'),
           ('heb', 'Salmon')),
}

# (substring of the fast.log contents, colour) rules for the "ek" corpus.
# "Orange" is matched as a single word since a line break may split
# "Sweet Orange".
_EK_COLORS = (("Angler", 'salmon'),
              ("Blackhole", 'black'),
              ("Redkit", 'red'),
              ("Orange", 'orange'),
              ("exploit kit", 'blue'))


def _firstMatchColor(text, rules):
    """Return the colour of the first rule whose marker occurs in text."""
    for marker, color in rules:
        if marker in text:
            return color
    return _DEFAULT_COLOR


def _ekColor(logFile):
    """Colour an exploit-kit specimen from the alerts in its fast.log."""
    try:
        with open(logFile, 'r') as f:
            buffer = f.read()
    except IOError:
        print("Cannot open %s for reading" % logFile)
        return _DEFAULT_COLOR
    return _firstMatchColor(buffer, _EK_COLORS)


def _tfidfWeight(tdm, nDocs):
    """Return a CSR copy of tdm with row i scaled by log2(nDocs / df_i)."""
    dfs = np.diff(tdm.indptr)      # stored entries per row = document freq.
    idf = np.zeros(len(dfs), dtype=float)
    present = dfs > 0
    idf[present] = np.log2(nDocs / dfs[present])
    weighted = tdm.copy()
    # CSR keeps each row's entries contiguous, so repeating idf[i] df_i
    # times lines the factors up with weighted.data.
    weighted.data = weighted.data * np.repeat(idf, dfs)
    # Terms present in every document get idf 0; drop the stored zeros so
    # such rows are recognised as empty.
    weighted.eliminate_zeros()
    return weighted


def _logEntropyWeight(tdm, nDocs):
    """Return a CSR copy of tdm with log-entropy term weighting applied.

    Entry (i, j) becomes log2(tf_ij + 1) * g_i, where
    g_i = 1 + sum_j p_ij * log2(p_ij) / log2(nDocs + 1) and
    p_ij = tf_ij / sum_j tf_ij.  Assumes the stored counts are
    non-negative.
    """
    nRows = tdm.shape[0]
    perRow = np.diff(tdm.indptr)
    rowOfEntry = np.repeat(np.arange(nRows), perRow)
    counts = tdm.data
    rowSums = np.bincount(rowOfEntry, weights=counts, minlength=nRows)

    # p * log2(p) is only defined for p > 0; zero counts contribute nothing.
    positive = counts > 0
    rowsPos = rowOfEntry[positive]
    p = counts[positive] / rowSums[rowsPos]
    sumPLogP = np.bincount(rowsPos, weights=p * np.log2(p), minlength=nRows)

    # nDocs + 1 keeps the denominator non-zero for a one-document corpus.
    rowWeight = 1.0 + sumPLogP / np.log2(nDocs + 1)

    weighted = tdm.copy()
    weighted.data = np.log2(counts + 1.0) * rowWeight[rowOfEntry]
    weighted.eliminate_zeros()
    return weighted


def useTDM(pickleFile, logDir, plotTitle, printTDM,
           nDims2Plot, nickname, usetfidf, uselogh):
    """Load a pickled TDM, weight it, project it with an SVD and plot it.

    Parameters:
        pickleFile: file holding [tdm, termSet, selectedFiles], with tdm a
            CSR matrix of one row per term and one column per document.
        logDir: directory of per-specimen log directories (used only for
            the "ek" corpus, to look up <logDir><basename>/fast.log).
        plotTitle: plot title; also names the output plots/<title>.html.
        printTDM: when true, dump the weighted matrix to stdout.
        nDims2Plot: 3 for a 3-D scatter plot, anything else for 2-D.
        nickname: corpus nickname ("vx", "ek", "ws" or "NT") selecting the
            colouring rules.
        usetfidf: apply tf.idf weighting.
        uselogh: apply log-entropy weighting (ignored if usetfidf is set).

    Raises:
        ValueError: if too few non-empty terms or documents remain for even
            one singular vector.
    """
    # if this fails, then plotly isn't working at all    
    print("Using plotly version %s" % plotly.__version__)

    print("Reading from %s" % pickleFile)
    # NOTE: unpickling runs arbitrary code from the file; only load pickles
    # this program produced itself.
    with open(pickleFile, 'rb') as fh:
        [tdm, termSet, selectedFiles] = pickle.load(fh)
    assert(tdm.getformat() == 'csr')
    nTerms = len(termSet)
    nSelectedFiles = len(selectedFiles)
    assert nTerms == tdm.shape[0]
    assert nSelectedFiles == tdm.shape[1]

    # Term weighting.  Both schemes work on a copy, so tdm is not modified.
    if usetfidf:
        mesg("calculating tfidf TDM")
        twtdm = _tfidfWeight(tdm, nSelectedFiles)
    elif uselogh:
        mesg("calculating log entropy TDM")
        twtdm = _logEntropyWeight(tdm, nSelectedFiles)
    else:
        twtdm = tdm

    # if we're to subtract the centroid, here would be the place
    # there's a function for that in vxlib

    assert(twtdm.getformat() == 'csr')

    # keep only those rows that have non-zero entries
    # since rows of zeros make svds unhappy
    oldR, oldC = twtdm.shape
    mesg("Old TWM has %d rows and %d columns" % (oldR, oldC))
    num_nonzeros = np.diff(twtdm.indptr)
    newtdm = twtdm[ num_nonzeros != 0 ]
    newR, newC = newtdm.shape
    mesg("After removing all-zero rows, TWM has %d rows and %d columns" %
          (newR, newC))

    # if any columns (documents) are now all zero, inform the user
    emptyCols = newtdm.getnnz(0) == 0
    if np.sum(emptyCols) > 0:
        mesg("Careful!  Some (%d) documents are empty:" % np.sum(emptyCols))
        print(np.array(selectedFiles)[ emptyCols ])

    # we have the TDM, so we can do an isomap if we wish
    # but for now save some memory
    del tdm

    # maybe we want to print the term-weighted TDM
    # (twtdm, not newtdm, so that row numbers still line up with termSet)
    if printTDM:
        mesg("Print the term-weighted TDM")
        for aRow in range(nTerms):
            print("Row %d (term \"%4s\"):\n" % 
                  (aRow, termSet[aRow]), end='')
            for aCol in range(0, nSelectedFiles-1):
                stdout.write("%4f, " % twtdm[aRow, aCol])
            # last column ends the line; printed once per row
            print("%4f" % twtdm[aRow, nSelectedFiles-1])

    mesg("Computing sparse svd")
    # Decompose the matrix with the empty rows removed.  Dropping all-zero
    # rows does not change the singular values or the rows of Vh.
    nComponents = min(6, min(newtdm.shape) - 1)
    if nComponents < 1:
        raise ValueError(
            "need at least 2 non-empty terms and 2 documents for the SVD, "
            "got a %d x %d matrix" % (newR, newC))
    _U, s, Vh = svds(newtdm, nComponents)

    # Put the components in descending order of singular value, reordering
    # the rows of Vh together with s so that Vh[0] really is the dominant
    # direction.
    order = np.argsort(s)[::-1]
    s = s[order]
    Vh = Vh[order, :]
    mesg("Singular values:")
    print(list(s))

    # the top r rows of Vh are the coordinates of the documents 
    # in the vector space spanned by the leftmost r columns of U.
    # the entries in s are scaling factors
    # maybe multiplying by the singular values isn't necessary...
    def component(i):
        # A tiny corpus can yield fewer than three components; use a zero
        # axis for the missing ones rather than failing.
        if i < Vh.shape[0]:
            return Vh[i]
        return np.zeros(Vh.shape[1])

    xs = component(0)
    ys = component(1)
    zs = component(2)

    # Assign one colour per document.  plotMe marks the points that would
    # be kept if uninteresting (default-coloured) ones were hidden; only
    # the "vx" rules ever clear it.
    colorVec = [_DEFAULT_COLOR] * len(selectedFiles)
    plotMe = np.ones(len(selectedFiles))

    if nickname == "ek":
        # see if we can find a logFile where this
        # specimen was labeled with an EK
        # reminds us of this useful command
        # find . -name fast.log -exec grep 'Redkit' {} \;
        for idx, aFile in enumerate(selectedFiles):
            logFile = logDir+os.path.basename(aFile)+"/fast.log"
            colorVec[idx] = _ekColor(logFile)
    elif nickname in _NAME_COLORS:
        rules = _NAME_COLORS[nickname]
        for idx, aFile in enumerate(selectedFiles):
            colorVec[idx] = _firstMatchColor(aFile, rules)
            if nickname == "vx" and colorVec[idx] == _DEFAULT_COLOR:
                plotMe[idx] = 0
    else:
        # reported once, not once per file
        mesg("unknown nickname, no colors assigned")

    # option to not plot points with default color
    # making points of interest easier to see, and webgl happier
    plotAll = True
    if plotAll:
        x = xs
        y = ys
        z = zs
        theseFiles = selectedFiles
        theseColors = colorVec
    else:
        plotThese = np.nonzero(plotMe)[0]
        print("size of plotThese is %s" % len(plotThese))
        x = xs[plotThese]
        y = ys[plotThese]
        z = zs[plotThese]
        theseFiles = [selectedFiles[idx] for idx in plotThese]
        theseColors = [colorVec[idx] for idx in plotThese]

    # The 2-D and 3-D traces differ only in the trace class and the z axis.
    traceArgs = dict(
        x=x,
        y=y,
        mode='markers',
        marker=dict(
            size=12,
            color=theseColors,
            opacity = 0.8
        ),
        text=theseFiles
    )
    if nDims2Plot == 3:  
        trace1 = go.Scatter3d(z=z, **traceArgs)
    else: # it must be a 2d plot
        trace1 = go.Scatter(**traceArgs)

    data = [trace1]
    layout = go.Layout(
        title = plotTitle,
        hovermode='closest',
        showlegend=False,
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )
    fig = go.Figure(data=data, layout=layout)
    # The output directory may not exist on a fresh checkout.
    os.makedirs('plots', exist_ok=True)
    plotly.offline.plot(fig, filename='plots/'+plotTitle+'.html')  

def main(argv):
    """Command-line entry point: build the TDM (-b) and/or visualize it (-v).

    Parameters:
        argv: full argument vector including the program name in argv[0],
            i.e. the same layout as sys.argv.

    At least one of -b/--build and -v/--visualize must be given; otherwise
    a usage error is reported.  The function always finishes by raising
    SystemExit.
    """
    parser = argparse.ArgumentParser()
    #
    # must specify the corpus nickname and the pickle file, in either -b or -v
    #
    parser.add_argument("-c","--corpus",
                        help="nickname for corpus",
                        dest="nickname",
                        default="no such corpus nickname")
    parser.add_argument("-p","--pickleFile",
                        help="pickle file to use",
                        dest="pickleFile",
                        default=None)
    #
    # use for build phase.  either -b or -v is required.
    #
    parser.add_argument("-b","--build",
                        help="read raw data and write pickle file",
                        default=False, action='store_true')
    parser.add_argument("-d","--directory",
                        help="directory for raw data",
                        dest="aDir",
                        default="/media/data/parse_data/Javascript/_data/")
    # remove terms that occur in more than this percentage of the files;
    # the default of 100 keeps everything
    parser.add_argument("-high","--highthreshold",
                        dest="highthreshold",
                        help="discard terms that occur too often",
                        type=int, default=100)
    # terms occurring in this many files or fewer are dropped, so the
    # default of 1 removes singletons; 0 keeps all terms
    parser.add_argument("-low","--lowthreshold", dest="lowthreshold",
                        help="discard terms that occur in this many files "
                             "or fewer (0 keeps all terms)",
                        type=int, default=1)
    parser.add_argument("-m","--match",
                        help="use files that match a pattern",
                        dest="matchText", default="")
    parser.add_argument("-n","--ngramLen", dest="ngramLen",
                        help="length of ngram",
                        type=int, default=4)
    parser.add_argument("-norm","--normalize",
                        help="normalize columns in TDM",
                        default=False, action='store_true')
    parser.add_argument("-q","--quick",
                        help="skip saving the Dfs",
                        default=False, action='store_true')
    parser.add_argument("-pr","--printTDM",
                        help="print the TDM",
                        default=False, action='store_true')
    parser.add_argument("-s","--sample", dest="sampleFactor",
                        help="sample using one out of so many files",
                        type=int, default=1)
    parser.add_argument("-skip","--skipPass1",
                        help="skip pass 1, read Dfs from dfs.pickle",
                        default=False, action='store_true')
    # accepted but not used anywhere below
    parser.add_argument("-z","--zipfile",
                        help="zip file for raw data",
                        dest="aZipfile",
                        default="/media/data/parse_data/Javascript/aZipfile")
    #
    # use with visualize phase.  either -b or -v is required
    #
    parser.add_argument("-v","--visualize",
                        help="read pickle file and make 2D or 3D plot",
                        default=False, action='store_true')
    parser.add_argument("-2d","--plot2d",
                        help="specify 2D plot",
                        default=False, action='store_true')
    # 3D is what you get unless -2d is given; this flag is accepted for
    # symmetry only
    parser.add_argument("-3d","--plot3d",
                        help="specify 3D plot",
                        default=True, action='store_true')
    parser.add_argument("-logh","--uselogh",
                        help="use log entropy term weighting",
                        default=False, action='store_true')
    parser.add_argument("-tfidf","--usetfidf",
                        help="use tf.idf term weighting",
                        default=False, action='store_true')
    parser.add_argument("-l","--logs",
                        help="directory for log data",
                        dest="logDir",
                        default="/media/data/parse_data/out/")
    parser.add_argument("-t","--title",
                        help="Title to use for plot",
                        dest="plotTitle", default="Default title")

    # Parse the vector we were handed (minus the program name) rather than
    # silently falling back to sys.argv.
    args = parser.parse_args(argv[1:])
    if not (args.build or args.visualize):
        parser.error("nothing to do: give -b/--build and/or -v/--visualize")

    print(datetime.datetime.now())
    pickleFile = args.pickleFile
    if pickleFile is None:
        # default pickle name is derived from the corpus nickname
        pickleFile = args.nickname+".pickle"

    if args.build:
        #
        # build phase: read the raw data, choose files and terms, and
        # write the pickle file holding the TDM
        #
        print("Using n= %d" % args.ngramLen)
        print("Corpus nickname (should make sense with directory): %s"
              % args.nickname)
        print("Data directory: %s" % args.aDir)

        if args.normalize:
            print("Data objects will be length-normalized")
        print("pickleFile is", pickleFile)
        print("Sampling factor: %d" % args.sampleFactor)

        if not args.skipPass1:
            selectedFiles = selectSomeFiles(
                args.aDir,
                args.matchText,
                args.sampleFactor,
                args.nickname
            )

            termDfs = buildTDMpass1(
                args.ngramLen,
                args.aDir,
                selectedFiles,
                pickleFile,
                args.nickname,
                args.quick
            )

        else:
            # reuse the document frequencies saved by an earlier pass 1
            pFile = args.nickname+".dfs.pickle"
            mesg("Reading selectedFiles and termDfs from "+pFile)
            with open(pFile, 'rb') as fh:
                [selectedFiles, termDfs] = pickle.load(fh)

        selectedTerms = selectSomeTerms(
            args.ngramLen,
            args.aDir,
            selectedFiles,
            termDfs,
            pickleFile,
            args.lowthreshold,
            args.highthreshold,
            args.nickname
        )

        # pass 2 writes the pickle file with the TDM itself
        buildTDMpass2(
            args.ngramLen,
            args.aDir,
            selectedFiles,
            selectedTerms,
            args.normalize,
            args.printTDM,
            pickleFile,
            args.nickname
        )
        mesg("built TDM")

    if args.visualize:
        #
        # visualize (or otherwise work with) data in pickle file
        #
        print("pickleFile is", pickleFile)
        print("Plot title will be %s" % args.plotTitle)
        nDims2Plot = 2 if args.plot2d else 3
        print("Making a %d-d plot" % nDims2Plot)

        useTDM(pickleFile, args.logDir,
               args.plotTitle, args.printTDM, nDims2Plot, args.nickname,
               args.usetfidf, args.uselogh)
    print("That's all, folks!")
    print(datetime.datetime.now())
    # sys.exit rather than the interactive-only exit() helper from site
    sys.exit()

# Run only when executed as a script, so that importing this module (for
# its helper functions) does not start a build and exit the interpreter.
if __name__ == "__main__":
    main(sys.argv)