get paid to paste

Suffix Tree Implementation using Suffix Array...

import java.io.*;
import java.util.*;
import java.awt.*;
import java.math.*;

public class Main {
	static PrintWriter out = new PrintWriter(new BufferedOutputStream(System.out) );
	static BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
	static StringTokenizer st = new StringTokenizer("");
	static String next() throws Exception {

		while (!st.hasMoreTokens()) {
			String s = br.readLine();
			if (s == null)
				return null;
			st = new StringTokenizer(s);
		}
		return st.nextToken();
	}

	public static void main(String[] asda) throws Exception {
		
		String s = "ABCBCAB$";
		s = "abaabbaaa$".toUpperCase();
		
	//	s = "ABAA$";
	//	s = "AAAB$";
	//	s = "ALEXXELA$";
		SuffixArray sa = new SuffixArray(s);
		SuffixTree st = new SuffixTree(s, sa.pos, sa.LCP);
		
		// tests if all suffixes are in tree
		for (int i = 0; i < s.length(); i++)	{
			String sub = s.substring(i);
			out.println( sub + " in tree?: " + st.contains(sub) );
		}
		
		//
		out.flush();
		System.exit(0);
	}

	static class SuffixTree	{
		
		static final int ALPH_SIZE = 26 + 26 + 1;	// how many symbols are admitted? + terminator symbol ($)
		
		// given a character, returns its index for suffix node
		// this function should be updated if we change the alphabet
		static int char2id(char ch)	{
			if	( ch == '$' )	return	0;	// end symbol
			if	( ch <= 'Z' )	return	ch - 'A' + 1;	// upper case
			return	ch - 'a' + 26 + 1;	// lower case
		}
		
		/**
		 * Node for suffix tree
		**/
		class SuffixNode	{
			SuffixNode sons [];	// who are sons of this node?
			SuffixNode dad;		// who is dad of this node?, null if this node is root
			int startIndex;		// where this suffix starts? substring(index)	- inclusive
			int endIndex = -1;	// where this suffix ends? substring(index, len + 1)	- inclusive
			int chars = 0;	// how many chars are between root and dad?
			
			// create a node given his dad
			SuffixNode(SuffixNode dad)	{
				sons = new SuffixNode [ ALPH_SIZE ];
				this.dad = dad;
			}
			
			/*
			 * insert idx-suffix ( TEXT.substring(idx) )
			 * only useful if the new node doesn't exists
			**/
			SuffixNode insert(int idx)	{
				if	( idx >= TEXT.length() )	return	null;	// avoid bad indexes
				
				int ch = char2id( TEXT.charAt(idx) );	// id of new node
				SuffixNode node = sons[ch];
				if	( node == null )	{
					// if node doesn't exists, insert the whole range only
					node = sons[ch] = new SuffixNode(this);
					
					// insert TEXT.substring(idx)
					node.startIndex = idx;
					node.endIndex = TEXT.length() - 1;
					
					// update char count between root and dad
					node.chars = this.chars + size();
				}
				return	node;
			}

			/**
			 * insert a new node ( TEXT.substring(idx) )
			 * lcp is the longest common prefix between this and idx-th suffix
			**/
			SuffixNode insert(int idx, int lcp)	{
				// update lcp and idx (if need it)
				lcp -= chars;
				idx += chars;
				
				// trim right part and add it like a son of this node (if need it)
				if	( startIndex + lcp <= endIndex )	{
					// right suffix trimmed
					int id = char2id( TEXT.charAt( startIndex + lcp ) );

					// like inesrt(idx)
					SuffixNode node = new SuffixNode(this);
					node.startIndex = startIndex + lcp;
					node.endIndex = endIndex;	// only changed this
					node.chars = chars + lcp;
					
					if	( sons[id] == null )	{
						// if node doesn't overlap with a son, only add it
						if	( node.size() != 0 )	{
							SuffixNode [] aux = node.sons;
							node.sons = sons;
							sons = aux;
							
							sons[id] = node;
						}
					}
					else	{
						// the node overlap with a son
						// replace that son with an intermediate node (node)
						SuffixNode [] aux = node.sons;
						node.sons = sons;
						sons = aux;
						
						sons[id] = node;
					}
				}
				
				// trim this node to lcp
				endIndex = startIndex + lcp - 1;
				
				// insert left suffix
				return	insert(lcp + idx);
			}
			
			
			/**
			 * compute longest common prefix between TEXT.substring(i) and TEXT.substring(k, kk + 1)
			**/
			int lcp(int i, int k, int kk)	{
				int ans = 0;
				
				while	( i < TEXT.length() && k <= kk && TEXT.charAt(i++) == TEXT.charAt(k++) )
					ans++;
				
				return	ans;
			}
			
			/**
			 * size of this suffix
			**/
			int size()	{
				return	endIndex - startIndex + 1;
			}
			
			
			/*
			 * String representation of this suffix (empty if root)
			**/
			public String toString()	{
				String s = TEXT.substring(startIndex, endIndex + 1);
				return	s;
			}
			
		}
		
		// root of suffix tree
		SuffixNode	root;
		
		// text for this suffix tree
		final String TEXT;
		
		/**
		 * create suffix tree for string str using its suffix array and longest common prefix array
		 * O(N logN)
		 * @author Alexis Hernández
		 */
		SuffixTree(String str, Integer [] sa, int [] LCP)	{
			int N = LCP.length;
			this.TEXT = str;
			
			root = new SuffixNode(null);
			
			// insert $
			SuffixNode node = root.insert( sa[0] );
			
			// test
			int i = 1;
			while	( i < N )	{
				
				if	( LCP[i] == 0 )	{
					// here starts a new bucket, insert it into root only
					node = root.insert( sa[i++] );
					continue;
				}
				
				// find node with sharing part of LCP
				while	( node.chars >= LCP[i]	)	{
					node = node.dad;
				}
				
				// insert this suffix into found node
				node = node.insert( sa[i], LCP[i] );
				
				i++;
			}
		}
		
		/**
		 * tests if this suffix tree contains s
		**/
		public boolean contains(String s)	{
			SuffixNode node = root;
			int i = 0;
			while	( i < s.length() )	{
				node = node.sons[ char2id( s.charAt(i) ) ];
				if	( node == null )
					return	false;
				
				for (int k = node.startIndex; i < s.length() && k <= node.endIndex; k++, i++)
					if	( s.charAt(i) != TEXT.charAt(k) )
						return	false;
			}
			return	true;
		}
	}
	
	
	static class SuffixArray {
		final String str; // input
		int N;
		Integer [] pos;	// output
		int [] rank;	// output
		int LCP [];		// LCP[i] = Longest common prefix of suffix pos[i] and suffix pos[i - 1]

		public SuffixArray(final String str) {
			this.str = str;
			N = str.length();

			pos = new Integer[N];
			rank = new int[N];
			int [] cnt = new int[N];
			int [] next = new int[N];
			boolean [] bh = new boolean[N];
			boolean [] b2h = new boolean[N];

			// sort suffixes according to their first characters
			for (int i = 0; i < N; i++)
				pos[i] = i;

			Arrays.sort(pos, new Comparator<Integer>() {
				public int compare(Integer a, Integer b) {
					return str.charAt(a) - str.charAt(b);
				}
			});
			// {pos contains the list of suffixes sorted by their first
			// character}

			bh[0] = true;
			for (int i = 1; i < N; ++i) {
				bh[i] = str.charAt(pos[i]) != str.charAt(pos[i - 1]);
			}

			for (int h = 1; h < N; h <<= 1) {
				// {bh[i] == false if the first h characters of pos[i-1] == the
				// first h characters of pos[i]}
				int buckets = 0;
				for (int i = 0, j; i < N; i = j) {
					j = i + 1;
					while (j < N && !bh[j])
						j++;
					next[i] = j;
					buckets++;
				}
				if (buckets == N)
					break; // We are done! Lucky bastards!
				// {suffixes are separted in buckets containing strings starting
				// with the same h characters}

				for (int i = 0; i < N; i = next[i]) {
					cnt[i] = 0;
					for (int j = i; j < next[i]; ++j) {
						rank[pos[j]] = i;
					}
				}

				cnt[rank[N - h]]++;
				b2h[rank[N - h]] = true;
				for (int i = 0; i < N; i = next[i]) {
					for (int j = i; j < next[i]; ++j) {
						int s = pos[j] - h;
						if (s >= 0) {
							int head = rank[s];
							rank[s] = head + cnt[head]++;
							b2h[rank[s]] = true;
						}
					}
					for (int j = i; j < next[i]; ++j) {
						int s = pos[j] - h;
						if (s >= 0 && b2h[rank[s]]) {
							for (int k = rank[s] + 1; k < N && !bh[k] && b2h[k]; k++)
								b2h[k] = false;
						}
					}
				}
				for (int i = 0; i < N; ++i) {
					pos[rank[i]] = i;
					bh[i] |= b2h[i];
				}
			}
			

			// compute inverse of suffix array
			for (int i = 0; i < N; ++i)
				rank[ pos[i] ] = i;
			
			/**
			 * Begin of the O(n) longest common prefix algorithm
			 * Refer to "Linear-Time Longest-Common-Prefix Computation in Suffix
			 * Arrays and Its Applications" by Toru Kasai, Gunho Lee, Hiroki
			 * Arimura, Setsuo Arikawa, and Kunsoo Park
			**/

			LCP = new int [N];
			LCP[0] = 0;
			for (int i = 0, h = 0; i < N; ++i)	{
				if ( rank[i] > 0 )	{
					int j = pos[ rank[i] - 1 ];
					while ( i + h < N && j + h < N && str.charAt(i+h) == str.charAt(j+h) )
						h++;
					LCP[ rank[i] ] = h;
					if ( h > 0 )
						h--;
				}
			}
		}

	}
}

Pasted: May 11, 2014, 10:45:15 pm
Views: 28