#
# should work for VX Heaven as well as exploit kit Javascript
# as well as Greek NT and Shakespeare, with the appropriate first pass
#
# to build TDM, run on dream-cs, e.g.
#   python3 vx5.py -b -c ws -d WSbyAct -q
# to make graphs showing specimens, run it anywhere, e.g.
#   python3 vx5.py -v -c vx -t "Title for Plot"
#
import argparse
import datetime
import getopt
import io
import math
import os
import pickle
import re
import sys
import time
from sys import stdout

# probably won't use both
#import matplotlib.pyplot as plt
#from mpl_toolkits.mplot3d import Axes3D
#from mpl_toolkits.mplot3d import proj3d

import numpy as np
from numpy.linalg import matrix_rank
# pip install plotly needs to be done somewhere
import plotly
import plotly.graph_objs as go
import plotly.offline
import scipy
import scipy.sparse
from scipy import linalg
from scipy.sparse.linalg import svds

try:
    # plotly >= 4 moved the online API to the separate chart-studio
    # package; only offline plotting is used here, so the import is optional
    import plotly.plotly as py
except ImportError:
    py = None

import vxlib as vx


def k2idx4(k):
    """Pack a 4-gram into an integer, one byte per character (big-endian).

    One-element slices are used instead of indexing so that the function
    accepts both str and bytes n-grams.
    """
    anInt = ord(k[0:1]) * (2**24)
    anInt += ord(k[1:2]) * (2**16)
    anInt += ord(k[2:3]) * (2**8)
    anInt += ord(k[3:4])
    return anInt


def idx2k4(idx):
    """Inverse of k2idx4: unpack an integer into a 4-character string."""
    i1, rem = divmod(idx, 2**24)
    i2, rem = divmod(rem, 2**16)
    i3, i4 = divmod(rem, 2**8)
    return "%c%c%c%c" % (chr(i1), chr(i2), chr(i3), chr(i4))


def k2idx3(k):
    """Pack a 3-gram into an integer, one byte per character (big-endian)."""
    anInt = ord(k[0:1]) * (2**16)
    anInt += ord(k[1:2]) * (2**8)
    anInt += ord(k[2:3])
    return anInt


def idx2k3(idx):
    """Inverse of k2idx3: unpack an integer into a 3-character string."""
    i1, rem = divmod(idx, 2**16)
    i2, i3 = divmod(rem, 2**8)
    return "%c%c%c" % (chr(i1), chr(i2), chr(i3))


def _ngramCodec(n):
    """Return (k2idx, idx2k, arraySize) for n-grams of length n (3 or 4).

    arraySize is the number of possible n-grams, i.e. 256**n.
    Raises ValueError for any other n.
    """
    if n == 4:
        codec = (k2idx4, idx2k4, 0x100000000)
        probe = "abcd"
    elif n == 3:
        codec = (k2idx3, idx2k3, 0x1000000)
        probe = "abc"
    else:
        raise ValueError("n-gram length must be 3 or 4, not %r" % (n,))
    # cheap round-trip sanity check of the codec
    assert codec[1](codec[0](probe)) == probe
    return codec


def _withTrailingSlash(aDir):
    """Return aDir guaranteed to end in exactly the slash it needs."""
    return aDir if aDir.endswith('/') else aDir + '/'


def _dumpPickle(obj, pFile):
    """Pickle obj to pFile, closing the file even if dumping fails."""
    with open(pFile, 'wb') as fh:
        pickle.dump(obj, fh, pickle.HIGHEST_PROTOCOL)


def _loadPickle(pFile):
    """Load and return the object stored in pFile.

    NOTE: pickle executes arbitrary code on load; only read pickle files
    that this script wrote itself.
    """
    with open(pFile, 'rb') as fh:
        return pickle.load(fh)


# note directories are slash-terminated
# for testing
#aDir="./nurseryRhyme/"
# Shakespeare
#aDir="./WS/"
# exploit kit Javascript
#aDir="/media/data/parse_data/Javascript/_data/"   # on dream-lab.cs.umbc.edu
#logDir="/media/data/parse_data/out/"              # on dream-lab.cs.umbc.edu
# VX Heaven
#aDir="./smallest/"
#aDir="/media/data/vxheaven/smaller/"              # on dream-lab.cs.umbc.edu
#aDir="/Volumes/MyBook/vxheaven/small/"            # on office Mac
#aDir="/media/data/vxheaven/small/"                # on dream-lab.cs.umbc.edu
#aDir="/media/data/vxheaven/viruses-2010-05-18/"   # on dream-lab.cs.umbc.edu
#aDir="/Volumes/MyBook/vxheaven/viruses-2010-05-18/" # on office Mac
#aDir="/home/charles/viruses-2010-05-18/"          # on office Mac

# CPU-time origin used by mesg() to timestamp progress messages
t = time.process_time()


def mesg(aString):
    """Print aString prefixed with the CPU seconds used since start-up."""
    print("%7.2f %s" % (time.process_time() - t, aString))


def selectSomeFiles(aDir, matchText, sampleFactor, nickname):
    """Choose the corpus files to read and record the choice in a pickle.

    aDir         -- directory holding the raw data
    matchText    -- if non-empty, keep only file names containing it
                    (e.g. "txt" for Shakespeare, ".pcap" for EK Javascript)
    sampleFactor -- keep one out of every sampleFactor candidate files
    nickname     -- corpus nickname, used to name the pickle file

    Returns the sorted list of full path names, which is also written to
    <aDir>/<nickname>.selectedFiles.pickle.
    Raises ValueError if sampleFactor is less than 1.
    """
    if sampleFactor < 1:
        raise ValueError("sampleFactor must be at least 1")

    allFiles = vx.getFileList(dir=aDir)
    # make sure aDir ends with a slash, so that full path names can be used
    aDir = _withTrailingSlash(aDir)

    # never treat our own pickle output as corpus data
    excludeThese = [".pickle"]
    selectThese = [matchText] if matchText != "" else []

    possibleFiles = [
        aFile for aFile in allFiles
        if (not selectThese or any(aStr in aFile for aStr in selectThese))
        and not any(bStr in aFile for bStr in excludeThese)
    ]
    selectedFiles = [aDir + aFile for aFile in possibleFiles[::sampleFactor]]

    mesg("Sorting list of files to be read")
    selectedFiles = sorted(selectedFiles)

    pFile = aDir + nickname + '.selectedFiles.pickle'
    mesg("Brief delay: writing to %s" % pFile)
    _dumpPickle([selectedFiles], pFile)
    return selectedFiles


def buildTDMpass1(n, aDir, selectedFiles, pickleFile, nickname, quick):
    """First pass: count, for every possible n-gram, the documents it is in.

    n             -- n-gram length (3 or 4)
    aDir          -- data directory (unused here, kept for the interface)
    selectedFiles -- full path names of the files to read
    pickleFile    -- TDM pickle name (unused here, kept for the interface)
    nickname      -- corpus nickname; results go to <nickname>.dfs.pickle
    quick         -- if true, skip writing the (large) pickle file

    Returns termDfs, an int32 array with one slot per POSSIBLE n-gram;
    a slot is non-zero iff that n-gram occurs somewhere in the corpus.
    Raises ValueError if n is not 3 or 4.
    """
    mesg("Using n=%d, nSelectedFiles=%d" % (n, len(selectedFiles)))
    k2idx, _idx2k, arraySize = _ngramCodec(n)

    nSelectedFiles = len(selectedFiles)
    mesg("Will read %d files" % (nSelectedFiles))

    mesg("Initializing array")
    termDfs = np.zeros(arraySize, dtype=np.int32)
    # saturate at the real limit of the array's dtype instead of wrapping
    maxFreq = np.iinfo(termDfs.dtype).max

    docCount = 0
    for aFile in selectedFiles:
        docTfs = vx.dictOfFile(aFile, n)
        docCount += 1
        mesg("File %d/%d: read %7d %d-grams from %s"
             % (docCount, nSelectedFiles, len(docTfs), n, aFile))
        # each distinct term of this document adds one to its document count
        for k in docTfs:
            idx = k2idx(k)
            if termDfs[idx] < maxFreq:
                termDfs[idx] += 1

    mesg("number of documents read: %d" % docCount)
    termCount = int(np.count_nonzero(termDfs))
    mesg("corpus has %d terms" % termCount)

    # same name that main() reads back when --skipPass1 is given
    pFile = nickname + '.dfs.pickle'
    if quick:
        mesg("Omit writing of %s" % pFile)
    else:
        mesg("Writing to %s" % pFile)
        _dumpPickle([selectedFiles, termDfs], pFile)
    return termDfs


def selectSomeTerms(n, aDir, selectedFiles, termDfs, pickleFile,
                    lowThreshold, highThreshold, nickname):
    """Pick the vocabulary: terms that are neither too rare nor too common.

    n             -- n-gram length (3 or 4)
    aDir          -- data directory; the vocabulary pickle is written there
    selectedFiles -- the corpus files (only their number is used)
    termDfs       -- document-frequency array from pass 1; MODIFIED IN
                     PLACE, rejected terms are zeroed
    pickleFile    -- TDM pickle name (unused here, kept for the interface)
    lowThreshold  -- drop terms found in this many files or fewer
                     (0 keeps singletons)
    highThreshold -- drop terms found in more than this percentage of the
                     files (100 disables the high cutoff)
    nickname      -- corpus nickname, used to name the pickle file

    Returns a dict mapping each kept term to its row index.
    Raises ValueError if n is not 3 or 4 or highThreshold exceeds 100.
    """
    _k2idx, idx2k, _arraySize = _ngramCodec(n)
    if highThreshold > 100:
        raise ValueError("highThreshold is a percentage, at most 100")

    mesg("Using low cutoff of %d (0 means keep singletons)" % lowThreshold)
    nSelectedTerms = np.count_nonzero(termDfs)
    if lowThreshold > 0:
        mesg("Before removing infrequent terms: %10d" % nSelectedTerms)
        termDfs[termDfs <= lowThreshold] = 0
        nSelectedTerms = np.count_nonzero(termDfs)
        mesg("After removing infrequent terms:  %10d" % nSelectedTerms)

    # express threshold for too common terms as a percentage
    # of the number of selected files
    highCutoff = len(selectedFiles) * highThreshold / 100
    mesg("Using high cutoff of %d percent (max 100 means no high cutoff)"
         % highThreshold)
    if highThreshold < 100:
        termDfs[termDfs > highCutoff] = 0
    nSelectedTerms = np.count_nonzero(termDfs)
    mesg("Number of terms to be kept: %10d" % nSelectedTerms)

    mesg("Adding %d terms to vocabulary" % nSelectedTerms)
    selectedTerms = {}
    for i, termIdx in enumerate(np.flatnonzero(termDfs)):
        if i % 100000 == 0:
            stdout.write(" %10d\r" % i)
        selectedTerms[idx2k(int(termIdx))] = i

    pFile = _withTrailingSlash(aDir) + nickname + '.selectedTerms.pickle'
    mesg("Brief delay: writing to %s" % pFile)
    _dumpPickle([selectedTerms], pFile)
    return selectedTerms


def _printTDM(matrix, termSet, cellFormat):
    """Print matrix one term row at a time, cells formatted by cellFormat."""
    for aRow in range(matrix.shape[0]):
        print("Row %d (term \"%4s\"):" % (aRow, termSet[aRow]))
        cells = matrix[aRow, :].toarray().ravel()
        print(", ".join(cellFormat % aCell for aCell in cells))


def buildTDMpass2(n, aDir, selectedFiles, selectedTerms, normalize,
                  printTDM, pickleFile, nickname):
    """Second pass: count the selected terms per file and pickle the TDM.

    n             -- n-gram length
    aDir          -- data directory (only reported)
    selectedFiles -- sorted full path names of the corpus files
    selectedTerms -- dict mapping each vocabulary term to its row index
    normalize     -- NOTE(review): only announced; no length normalization
                     is actually applied anywhere in this function
    printTDM      -- if true, print the raw matrix
    pickleFile    -- where [tdm, termSet, selectedFiles] is written
    nickname      -- corpus nickname (unused here)

    Returns the term-by-document matrix (CSR, float64, one row per term
    and one column per document).
    Raises ValueError if selectedFiles is empty.
    """
    mesg("Entering buildTDMpass2, n is %d, aDir is %s " % (n, aDir))
    nSelectedFiles = len(selectedFiles)
    nSelectedTerms = len(selectedTerms)
    mesg("nSelectedFiles: %d" % nSelectedFiles)
    mesg("nSelectedTerms: %d" % nSelectedTerms)
    if nSelectedFiles == 0:
        raise ValueError("no files were selected, cannot build a TDM")
    if normalize:
        mesg("TDM will be length normalized")
    if printTDM:
        mesg("TDM will be printed")

    # hook the read function so we can see what's happening
    # verbose files are file like objects, for our purpose, since
    # they have their own read function
    nFilesRead = 0

    class verboseFile(io.BufferedIOBase):
        def __init__(self, aFile):
            super().__init__()
            self.pathName = aFile

        def read(self):
            nonlocal nFilesRead
            nFilesRead += 1
            print(" File %d/%d, %s, of size %d"
                  % (nFilesRead, nSelectedFiles,
                     os.path.basename(self.pathName),
                     os.path.getsize(self.pathName)))
            with open(self.pathName, "rb") as fh:
                return fh.read()

    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(
        input='file',               # send it list of file-like objects
        analyzer='char',
        ngram_range=(n, n),
        lowercase=False,
        dtype=np.int64,             # some counts can get big
        vocabulary=selectedTerms,
        decode_error="ignore"
    )

    # the vocabulary is fixed, so the files can be vectorized in batches
    # and the pieces stacked once at the end
    batchSize = 1000    # 10 for testing, 1000 or more for serious use
    pieces = []
    for start in range(0, nSelectedFiles, batchSize):
        theseFiles = selectedFiles[start:start + batchSize]
        theFiles = [verboseFile(f) for f in theseFiles]
        pieces.append(vectorizer.fit_transform(theFiles))
        mesg("added %d files to tdm" % len(theseFiles))
    tdm = scipy.sparse.vstack(pieces)

    #
    # at this point, tdm has a ROW for each document,
    # and a COLUMN for each term
    #
    assert nSelectedFiles == tdm.shape[0]
    nTerms = tdm.shape[1]
    mesg("number of documents read: %d" % nSelectedFiles)
    mesg("number of terms observed: %d" % nTerms)

    # the following code assumes TDM has one row per term
    # and one column per document, so transpose
    mesg("Transposing the raw TDM")
    # vstack may hand back COO, which cannot be indexed; go to CSR right away
    mesg("Converting raw TDM from coo to csr format")
    tdm = scipy.sparse.csr_matrix(tdm.transpose())

    # tdm has the number of times term i occurs in document j
    # term i is the ith element of termSet
    # document j is the jth element of selectedFiles, which is sorted
    # selectedTerms is a hash, and its keys are sorted to create termSet
    mesg("Sorting the term set")
    termSet = sorted(selectedTerms.keys())

    if printTDM:
        mesg("Print the raw TDM")
        _printTDM(tdm, termSet, "%3d")

    # the sparse matrix package keeps count of the non-zero entries
    mesg("Density of raw TDM is %f (max is 1.0)"
         % (tdm.getnnz() / (nTerms * nSelectedFiles)))

    # the use phase (term weighting, svds) needs floating point entries
    tdm = tdm.astype(np.float64)

    # save data structures for use in use phase
    mesg("Writing to %s" % pickleFile)
    _dumpPickle([tdm, termSet, selectedFiles], pickleFile)
    mesg("Finished building TDM")
    return tdm


def _tfidfWeight(tdm, nDocs):
    """Return a CSR copy of tdm with each row scaled by log2(nDocs/df).

    Rows with no entries (df == 0) get weight 0 instead of dividing by 0.
    """
    dfs = np.diff(tdm.indptr)       # number of stored entries in each row
    idf = np.zeros(tdm.shape[0])
    seen = dfs > 0
    idf[seen] = np.log2(nDocs / dfs[seen])
    return scipy.sparse.diags(idf).dot(tdm).tocsr()


def _logEntropyWeight(tdm, nDocs):
    """Return a CSR log-entropy weighted copy of tdm.

    Entry (i, j) becomes log2(tf + 1) * g_i, where
    g_i = 1 + sum_j p_ij * log2(p_ij) / log2(nDocs + 1)
    and p_ij = tf_ij / (total count of term i).  Both logarithms use
    base 2 so that g_i stays between 0 and 1.
    """
    nTerms = tdm.shape[0]
    data = tdm.data.astype(np.float64)
    # row index of every stored entry, in the order of tdm.data
    rows = np.repeat(np.arange(nTerms), np.diff(tdm.indptr))
    rowSums = np.bincount(rows, weights=data, minlength=nTerms)

    positive = data > 0
    plogp = np.zeros_like(data)
    pij = data[positive] / rowSums[rows[positive]]
    plogp[positive] = pij * np.log2(pij)
    numer = np.bincount(rows, weights=plogp, minlength=nTerms)

    for i in range(0, nTerms, 5000):
        print("progress: for term %d, numer is %f " % (i, numer[i]))

    rowWeight = 1 + numer / np.log2(nDocs + 1)
    twtdm = scipy.sparse.csr_matrix(tdm, dtype=np.float64, copy=True)
    twtdm.data = np.log2(data + 1) * rowWeight[rows]
    return twtdm


# linen makes a nice default color
_defaultColor = 'Linen'

# first marker found in a fast.log decides the color of an EK specimen;
# "Orange" is a single word since "Sweet Orange" may be split by a line break
_ekMarkers = [
    ("Angler", 'salmon'),
    ("Blackhole", 'black'),
    ("Redkit", 'red'),
    ("Orange", 'orange'),
    ("exploit kit", 'blue'),
]

# for color names see http://www.w3schools.com/colors/colors_names.asp
_fileNameColors = {
    "vx": [('Rootkit', 'Brown'), ('KeyLogger', 'Green')],
    "ws": [('Verona', 'rgba(200, 50, 50, .9)'), ('Henry', 'Green')],
    # 'Scarlet' is not a CSS color name, so mark is drawn in Crimson
    "NT": [('matt', 'Purple'), ('mark', 'Crimson'), ('luke', 'Green'),
           ('john', 'Blue'), ('heb', 'Salmon')],
}


def _ekColor(logFile):
    """Return the color for the exploit kit named in logFile, if any."""
    try:
        with open(logFile, 'r') as f:
            buffer = f.read()
    except IOError:
        print("Cannot open %s for reading" % logFile)
        return _defaultColor
    for marker, chosenColor in _ekMarkers:
        if marker in buffer:
            return chosenColor
    return _defaultColor


def _assignColors(selectedFiles, nickname, logDir):
    """Return (colorVec, plotMe) for the documents of the named corpus.

    colorVec holds one valid color per file; plotMe is 0 for the "vx"
    files that only got the default color and 1 everywhere else.
    """
    colorVec = [_defaultColor] * len(selectedFiles)
    plotMe = np.ones(len(selectedFiles))

    if nickname == "ek":
        # see if we can find a logFile where this
        # specimen was labeled with an EK
        # reminds us of this useful command
        #   find . -name fast.log -exec grep 'Redkit' {} \;
        for idx, aFile in enumerate(selectedFiles):
            logFile = logDir + os.path.basename(aFile) + "/fast.log"
            colorVec[idx] = _ekColor(logFile)
    elif nickname in _fileNameColors:
        for idx, aFile in enumerate(selectedFiles):
            for marker, chosenColor in _fileNameColors[nickname]:
                if marker in aFile:
                    colorVec[idx] = chosenColor
                    break
            else:
                if nickname == "vx":
                    plotMe[idx] = 0
    else:
        mesg("unknown nickname, no colors assigned")
    return colorVec, plotMe


def useTDM(pickleFile, logDir, plotTitle, printTDM, nDims2Plot, nickname,
           usetfidf, uselogh):
    """Weight the pickled TDM, reduce it with a sparse SVD and plot it.

    pickleFile -- file written by buildTDMpass2
    logDir     -- directory with per-specimen fast.log files ("ek" only)
    plotTitle  -- plot title, also the name of the html file in plots/
    printTDM   -- if true, print the term-weighted matrix
    nDims2Plot -- 3 for a 3-d scatter plot, anything else for 2-d
    nickname   -- corpus nickname, selects the coloring scheme
    usetfidf   -- use tf.idf term weighting
    uselogh    -- use log entropy term weighting (if usetfidf is false)

    Raises ValueError if the pickled data is inconsistent or the matrix
    is too small to yield the requested number of dimensions.
    """
    # if this fails, then plotly isn't working at all
    print("Using plotly version %s" % plotly.__version__)
    print("Reading from %s" % pickleFile)
    [tdm, termSet, selectedFiles] = _loadPickle(pickleFile)
    tdm = scipy.sparse.csr_matrix(tdm)

    nTerms = len(termSet)
    nSelectedFiles = len(selectedFiles)
    if tdm.shape != (nTerms, nSelectedFiles):
        raise ValueError("TDM shape %s does not match %d terms x %d files"
                         % (str(tdm.shape), nTerms, nSelectedFiles))

    if usetfidf:
        mesg("calculating tfidf TDM")
        twtdm = _tfidfWeight(tdm, nSelectedFiles)
    elif uselogh:
        mesg("calculating log entropy TDM")
        twtdm = _logEntropyWeight(tdm, nSelectedFiles)
    else:
        twtdm = tdm
    # we have the TDM, so we could do an isomap if we wished
    # but for now save some memory
    del tdm

    # weights of exactly zero must not count as entries below
    twtdm.eliminate_zeros()

    # keep only those rows that have non-zero entries
    # since rows of zeros make svds unhappy
    oldR, oldC = twtdm.shape
    mesg("Old TWM has %d rows and %d columns" % (oldR, oldC))
    newtdm = twtdm[twtdm.getnnz(axis=1) != 0]
    newR, newC = newtdm.shape
    mesg("After removing all-zero rows, TWM has %d rows and %d columns"
         % (newR, newC))

    # if any columns (documents) are now all zero, inform the user
    emptyCols = newtdm.getnnz(0) == 0
    if np.sum(emptyCols) > 0:
        mesg("Careful! Some (%d) documents are empty:" % np.sum(emptyCols))
        print(np.array(selectedFiles)[emptyCols])

    # maybe we want to print the term-weighted TDM
    if printTDM:
        mesg("Print the term-weighted TDM")
        _printTDM(twtdm, termSet, "%4f")

    nDims = 3 if nDims2Plot == 3 else 2
    nSingular = min(6, min(newtdm.shape) - 1)
    if nSingular < nDims:
        raise ValueError("TWM of shape %s is too small for a %d-d plot"
                         % (str(newtdm.shape), nDims))

    mesg("Computing sparse svd")
    U, s, Vh = svds(newtdm, nSingular)
    # svds does not return the largest singular value first, so reorder
    # s and the rows of Vh TOGETHER to keep them matched
    order = np.argsort(s)[::-1]
    s = s[order]
    Vh = Vh[order, :]
    mesg("Singular values:")
    print(list(s))

    # the top r rows of Vh are the coordinates of the documents
    # in the vector space spanned by the leftmost r columns of U.
    # the entries in s are scaling factors
    # maybe multiplying by the singular values isn't necessary...
    xs = Vh[0, ]
    ys = Vh[1, ]
    zs = Vh[2, ] if nSingular > 2 else np.zeros(len(xs))

    colorVec, plotMe = _assignColors(selectedFiles, nickname, logDir)

    # option to not plot points with default color
    # making points of interest easier to see, and webgl happier
    plotAll = True
    if plotAll:
        x, y, z = xs, ys, zs
        theseFiles = selectedFiles
        theseColors = colorVec
    else:
        plotThese = np.nonzero(plotMe)[0]
        print("size of plotThese is %s" % len(plotThese))
        x, y, z = xs[plotThese], ys[plotThese], zs[plotThese]
        theseFiles = [selectedFiles[idx] for idx in plotThese]
        theseColors = [colorVec[idx] for idx in plotThese]

    markerStyle = dict(size=12, color=theseColors, opacity=0.8)
    if nDims2Plot == 3:
        # from the plotly web page
        trace1 = go.Scatter3d(x=x, y=y, z=z, mode='markers',
                              marker=markerStyle, text=theseFiles)
    else:
        # it must be a 2d plot
        trace1 = go.Scatter(x=x, y=y, mode='markers',
                            marker=markerStyle, text=theseFiles)

    data = [trace1]
    layout = go.Layout(
        title=plotTitle,
        hovermode='closest',
        showlegend=False,
        margin=dict(l=0, r=0, b=0, t=0)
    )
    fig = go.Figure(data=data, layout=layout)
    # the output directory may not exist on a fresh checkout
    os.makedirs('plots', exist_ok=True)
    plotly.offline.plot(fig, filename='plots/' + plotTitle + '.html')


def main(argv):
    """Command line entry point; argv is the full vector, e.g. sys.argv.

    Runs the build phase (-b), the visualize phase (-v), or both.
    Returns 0 on success; argparse exits the process on bad arguments.
    """
    parser = argparse.ArgumentParser()
    #
    # must specify the corpus nickname and the pickle file, in either -b or -v
    #
    parser.add_argument("-c", "--corpus", nargs=1,
                        help="nickname for corpus", dest="nickname",
                        default=["no such corpus nickname"])
    parser.add_argument("-p", "--pickleFile", nargs=1,
                        help="pickle file to use", dest="pickleFile",
                        default=["no pickle file"])
    #
    # use for build phase. either -b or -v is required.
    #
    parser.add_argument("-b", "--build",
                        help="read raw data and write pickle file",
                        default=False, action='store_true')
    parser.add_argument("-d", "--directory", nargs=1,
                        help="directory for raw data", dest="aDir",
                        default=["/media/data/parse_data/Javascript/_data/"])
    # the default is to remove terms that occur in
    # more than this percentage of the files
    parser.add_argument("-high", "--highthreshold", nargs=1,
                        dest="highthreshold",
                        help="discard terms that occur too often",
                        type=int, default=[100])
    # the default threshold of 1 removes singletons; 0 keeps all terms
    parser.add_argument("-low", "--lowthreshold", nargs=1,
                        dest="lowthreshold",
                        help="discard terms that occur in this many files"
                             " or fewer (0 keeps all terms)",
                        type=int, default=[1])
    parser.add_argument("-m", "--match", nargs=1,
                        help="use files that match a pattern",
                        dest="matchText", default=[""])
    parser.add_argument("-n", "--ngramLen", nargs=1, dest="ngramLen",
                        help="length of ngram", type=int, default=[4])
    parser.add_argument("-norm", "--normalize",
                        help="normalize columns in TDM",
                        default=False, action='store_true')
    parser.add_argument("-q", "--quick", help="skip saving the Dfs",
                        default=False, action='store_true')
    parser.add_argument("-pr", "--printTDM", help="print the TDM",
                        default=False, action='store_true')
    parser.add_argument("-s", "--sample", nargs=1, dest="sampleFactor",
                        help="sample using one out of so many files",
                        type=int, default=[1])
    parser.add_argument("-skip", "--skipPass1",
                        help="skip pass 1, read Dfs from dfs.pickle",
                        default=False, action='store_true')
    parser.add_argument("-z", "--zipfile", nargs=1,
                        help="zip file for raw data", dest="aZipfile",
                        default=["/media/data/parse_data/Javascript/aZipfile"])
    #
    # use with visualize phase. either -b or -v is required
    #
    parser.add_argument("-v", "--visualize",
                        help="read pickle file and make 2D or 3D plot",
                        default=False, action='store_true')
    parser.add_argument("-2d", "--plot2d", help="specify 2D plot",
                        default=False, action='store_true')
    parser.add_argument("-3d", "--plot3d", help="specify 3D plot",
                        default=True, action='store_true')
    parser.add_argument("-logh", "--uselogh",
                        help="use log entropy term weighting",
                        default=False, action='store_true')
    parser.add_argument("-tfidf", "--usetfidf",
                        help="use tf.idf term weighting",
                        default=False, action='store_true')
    parser.add_argument("-l", "--logs", nargs=1,
                        help="directory for log data", dest="logDir",
                        default=["/media/data/parse_data/out/"])
    parser.add_argument("-t", "--title", nargs=1,
                        help="Title to use for plot", dest="plotTitle",
                        default=["Default title"])

    # parse the vector we were handed rather than silently using sys.argv
    args = parser.parse_args(argv[1:])
    if not (args.build or args.visualize):
        parser.error("either -b or -v is required")

    print(datetime.datetime.now())

    if args.pickleFile[0] == "no pickle file":
        args.pickleFile[0] = args.nickname[0] + ".pickle"

    if args.build:
        #
        # build phase, where raw data is read and the pickle file
        # with the TDM is written
        #
        print("Using n= %d" % args.ngramLen[0])
        print("Corpus nickname (should make sense with directory): %s"
              % args.nickname[0])
        print("Data directory: %s" % args.aDir[0])
        if args.normalize:
            print("Data objects will be length-normalized")
        print("pickleFile is", args.pickleFile[0])
        print("Sampling factor: %d" % args.sampleFactor[0])

        if not args.skipPass1:
            selectedFiles = selectSomeFiles(
                args.aDir[0],
                args.matchText[0],
                args.sampleFactor[0],
                args.nickname[0]
            )
            termDfs = buildTDMpass1(
                args.ngramLen[0],
                args.aDir[0],
                selectedFiles,
                args.pickleFile[0],
                args.nickname[0],
                args.quick
            )
        else:
            pFile = args.nickname[0] + ".dfs.pickle"
            mesg("Reading selectedFiles and termDfs from " + pFile)
            [selectedFiles, termDfs] = _loadPickle(pFile)

        selectedTerms = selectSomeTerms(
            args.ngramLen[0],
            args.aDir[0],
            selectedFiles,
            termDfs,
            args.pickleFile[0],
            args.lowthreshold[0],
            args.highthreshold[0],
            args.nickname[0]
        )
        # buildTDMpass2 also writes the pickle file with the TDM
        tdm = buildTDMpass2(
            args.ngramLen[0],
            args.aDir[0],
            selectedFiles,
            selectedTerms,
            args.normalize,
            args.printTDM,
            args.pickleFile[0],
            args.nickname[0]
        )
        mesg("built TDM")

    if args.visualize:
        #
        # visualize (or otherwise work with) data in pickle file
        #
        print("pickleFile is", args.pickleFile[0])
        print("Plot title will be %s" % args.plotTitle[0])
        if args.plot2d:
            nDims2Plot = 2
            print("Making a 2-d plot")
        else:
            nDims2Plot = 3
            print("Making a 3-d plot")
        useTDM(args.pickleFile[0], args.logDir[0], args.plotTitle[0],
               args.printTDM, nDims2Plot, args.nickname[0],
               args.usetfidf, args.uselogh)

    print("That's all, folks!")
    print(datetime.datetime.now())
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))