package org.java.evolutionary.sequence;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.InputStreamReader;
import java.io.FileWriter;
import java.io.BufferedWriter;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.*;

import org.apache.commons.io.FileUtils;
import org.biojava.bio.seq.DNATools;
import org.biojava.bio.seq.Sequence;
import org.biojava.bio.seq.SequenceIterator;
import org.biojava.bio.seq.io.SeqIOTools;
import org.biojava.bio.symbol.FiniteAlphabet;
import org.biojava.bio.symbol.SymbolList;
import org.biojava.utils.regex.Matcher;
import org.biojava.utils.regex.Pattern;
import org.biojava.utils.regex.PatternFactory;

/**
 * This class reads the features stored in a file (hall of fame output) and
 * generates LibSVM-format files. It also outputs the features separately for
 * those who want to study them; the features are written to the file
 * SSCleanFeatures.txt.
 * <p>
 * Feature matching is computed in parallel. The degree of parallelism is
 * controlled by the threads input argument. The sequences are chunked: the
 * first threads-1 workers each receive an equal share, and the last worker
 * receives the remaining sequences when the division is uneven. For best
 * throughput the number of threads should equal the number of cores/processors
 * on the machine.
 * <p>
 * This class also performs simple simplification of the feature trees, e.g.
 * expressions such as (AND true true) or (OR (NOT false)) are reduced. In the
 * future the features could be reduced further to remove redundancy, such as
 * matchesAtPosition motif3 AGT @ 45 AND matchesAtPosition motif1 T @ 47, but
 * we still have to think through whether such redundancy (bloat) is good or
 * bad in some cases.
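 * <p>
 * For illustration (the feature indices below are hypothetical): a positive
 * sequence that matches the 3rd and 17th loaded feature trees is written as
 * the sparse LibSVM line
 * <pre>
 * 1 3:1 17:1
 * </pre>
 * i.e. the class label followed by the 1-based indices of the matching
 * features, with non-matching features omitted.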
 *
 * @author udaykamath
 */
public class HypersensitiveSequenceFeatureInterpreter {

    public boolean cleanOnly;

    // holds the sequences in memory
    ArrayList sequencesList = new ArrayList();

    // holds the corresponding labels in memory
    ArrayList labels = new ArrayList();

    public static PatternFactory factory;

    ArrayList featureTrees = new ArrayList();

    BufferedWriter writer = null;

    protected static ArrayList positiveSequencesList = new ArrayList();

    protected static ArrayList negativeSequencesList = new ArrayList();

    // feature index used while parsing the feature trees
    private int[] fi = { 0 };

    private void initWriter(String outputFile) {
        try {
            writer = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(outputFile, true)));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * All threads use this, so it is synchronized.
     *
     * @param data
     * @throws Exception
     */
    public void quickWrite(String data) throws Exception {
        synchronized (writer) {
            writer.write(data);
        }
    }

    public void close() throws Exception {
        writer.flush();
        writer.close();
    }

    public void generateLibSVMFile(File gpFile, int threads) throws Exception {
        // gpFile is the file that stores the features as graphs/trees
        ArrayList featureStrings = new ArrayList();
        ArrayList svmLines = new ArrayList();
        try {
            BufferedReader r = new BufferedReader(new InputStreamReader(
                    new FileInputStream(gpFile)));
            String line = r.readLine();
            FTNode ftn;
            int k = 0;
            while (line != null) {
                fi[0] = 0;
                System.out.println("Loading: " + line);
                ftn = FTNode.load_tree(line, fi, factory);
                if (ftn != null) {
                    ftn.cleanse();
                    // skip duplicate feature trees
                    if (!featureStrings.contains(ftn.toString())) {
                        featureTrees.add(ftn);
                        featureStrings.add(ftn.toString());
                        System.out.println("Loaded feature tree:" + k++);
                    }
                } else
                    System.out.println("Couldn't read feature tree " + k);
                line = r.readLine();
            }
            // File cleanFeatures = new File("./HSfeaturesClean.txt");
            // FileUtils.writeLines(cleanFeatures, featureTrees);
            // System.out.println("Clean features written...");
        } catch (Exception e) {
            e.printStackTrace();
        }
        if (cleanOnly) {
            System.out.println("Done.");
            return;
        }
        int index = 0;
        // guard against a zero chunk size when there are more threads than sequences
        int chunkSize = Math.max(1, this.sequencesList.size() / threads);
        int threadCount = 0;
        ArrayList listOfWorkers = new ArrayList();
        while (index < this.sequencesList.size()) {
            int count = sequencesList.size() - index > chunkSize ? chunkSize
                    : sequencesList.size() - index;
            int start = index;
            int end = start + count - 1;
            ThreadWorker worker = new ThreadWorker("Thread" + threadCount, start, end);
            worker.start();
            threadCount = threadCount + 1;
            index += chunkSize;
            listOfWorkers.add(worker);
        }
        // join the worker threads
        for (int i = 0; i < threadCount; i++)
            ((Thread) listOfWorkers.get(i)).join();
        // done, flush and close the output
        close();
    }

    /**
     * This method reads the sequences from the positive and negative files,
     * labels them +1 and -1, and puts them into the right buckets.
     * It also initializes the pattern factory used for IUPAC parsing.
     *
     * @param positiveFileName the FASTA file with the positive sequences
     * @param negativeFileName the FASTA file with the negative sequences
     */
    public void setup(String positiveFileName, String negativeFileName) {
        BufferedReader positiveBuffer;
        BufferedReader negativeBuffer;
        try {
            // read the files
            positiveBuffer = new BufferedReader(new FileReader(positiveFileName));
            negativeBuffer = new BufferedReader(new FileReader(negativeFileName));
            SequenceIterator positiveFileIterator = (SequenceIterator) SeqIOTools
                    .fileToBiojava("fasta", "DNA", positiveBuffer);
            SequenceIterator negativeFileIterator = (SequenceIterator) SeqIOTools
                    .fileToBiojava("fasta", "DNA", negativeBuffer);
            this.countTotalSequence(positiveFileIterator, true);
            this.countTotalSequence(negativeFileIterator, false);
            this.sequencesList = new ArrayList();
            this.labels = new ArrayList();
            // iterate the positive sequences
            for (int i = 0; i < positiveSequencesList.size(); i++) {
                this.sequencesList.add(((Sequence) positiveSequencesList.get(i)).seqString());
                this.labels.add("1");
            }
            // iterate the negative sequences
            for (int i = 0; i < negativeSequencesList.size(); i++) {
                this.sequencesList.add(((Sequence) negativeSequencesList.get(i)).seqString());
                this.labels.add("-1");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // create the IUPAC pattern factory
        FiniteAlphabet iupac = DNATools.getDNA();
        factory = PatternFactory.makeFactory(iupac);
    }

    private int countTotalSequence(SequenceIterator iterator, boolean positive) {
        int total = 0;
        try {
            while (iterator.hasNext()) {
                Sequence seq = iterator.nextSequence();
                total = total + 1;
                if (positive)
                    this.positiveSequencesList.add(seq);
                else
                    this.negativeSequencesList.add(seq);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return total;
    }

    public static void main(String[] args) {
        try {
            if (args.length < 5) {
                System.err.println("HypersensitiveSequenceFeatureInterpreter "
                        + "hallOfFameFeaturesFile outputSVMFile threads positiveFile negativeFile");
                System.exit(-1);
            }
            File f = new File(args[0]);
            HypersensitiveSequenceFeatureInterpreter gen = new HypersensitiveSequenceFeatureInterpreter();
            String outputFile = args[1];
            gen.initWriter(outputFile);
            String threadString = args[2];
            gen.setup(args[3], args[4]);
            int threads = Integer.parseInt(threadString);
            gen.generateLibSVMFile(f, threads);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Threaded implementation to compute the SVM feature matching.
     *
     * @author udaykamath
     */
    class ThreadWorker extends Thread {

        int myStart;
        int myEnd;

        ThreadWorker(String name, int start, int end) {
            super(name);
            this.myEnd = end;
            this.myStart = start;
        }

        public void run() {
            for (int k = myStart; k <= myEnd; k++) {
                // get the sequence
                String sequence = (String) sequencesList.get(k);
                // get the label
                String label = (String) labels.get(k);
                String featureString = label;
                // create the SymbolList once per sequence rather than once per feature
                SymbolList currentSequence = null;
                try {
                    currentSequence = DNATools.createDNA(sequence);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                for (int i = 0; i < featureTrees.size(); i++) {
                    // measure the match, create the sparse representation
                    if (((FTNode) featureTrees.get(i)).value(currentSequence)) {
                        featureString = featureString + " " + (i + 1) + ":" + 1;
                    }
                }
                featureString = featureString + "\n";
                try {
                    quickWrite(featureString);
                    System.out.println("completed sequence:" + k);
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
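// A sketch of the expected command-line invocation, based on the argument order
// handled in main(); the file names below are hypothetical placeholders:
//
//   java org.java.evolutionary.sequence.HypersensitiveSequenceFeatureInterpreter \
//       hallOfFameFeatures.txt output.libsvm 8 positives.fasta negatives.fasta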