001/* Copyright (C) 2013 TU Dortmund
002 * This file is part of AutomataLib, http://www.automatalib.net/.
003 * 
004 * AutomataLib is free software; you can redistribute it and/or
005 * modify it under the terms of the GNU Lesser General Public
006 * License version 3.0 as published by the Free Software Foundation.
007 * 
008 * AutomataLib is distributed in the hope that it will be useful,
009 * but WITHOUT ANY WARRANTY; without even the implied warranty of
010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
011 * Lesser General Public License for more details.
012 * 
013 * You should have received a copy of the GNU Lesser General Public
014 * License along with AutomataLib; if not, see
015 * http://www.gnu.de/documents/lgpl.en.html.
016 */
017package net.automatalib.words;
018
019import java.io.IOException;
020import java.util.AbstractList;
021import java.util.Collections;
022import java.util.List;
023import java.util.Objects;
024
025import net.automatalib.commons.util.array.ArrayWritable;
026import net.automatalib.commons.util.strings.AbstractPrintable;
027import net.automatalib.commons.util.strings.StringUtil;
028
029/**
030 * A word is an ordered sequence of symbols. {@link Word}s are generally immutable,
031 * i.e., a single {@link Word} object will never change (unless symbol objects are modified,
032 * which is however highly discouraged).
033 * <p>
034 * This class provides the following static methods for creating words in the most common scenarios:
035 * <ul>
036 * <li> {@link #epsilon()} returns the empty word of length 0
037 * <li> {@link #fromLetter(Object)} turns a single letter into a word of length 1
038 * <li> {@link #fromSymbols(Object...)} creates a word from an array of symbols
039 * <li> {@link #fromArray(Object[], int, int)} creates a word from a subrange of a symbols array
040 * <li> {@link #fromList(List)} creates a word from a {@link List} of symbols
041 * </ul>
042 * <p>
043 * Modification operations like {@link #append(Object)} or {@link #concat(Word...)} create
044 * new objects, subsequently invoking these operations on the respective objects returned is
045 * therefore highly inefficient. If words need to be dynamically created, a {@link WordBuilder}
046 * should be used.
047 * <p>
048 * This is an abstract base class for word representations. Implementing classes only need to implement
049 * <ul>
050 * <li> {@link #getSymbol(int)}
051 * <li> {@link #length()}
052 * </ul>
053 * <p>
054 * However, for the sake of efficiency it is highly encouraged to overwrite the other methods
055 * as well, providing specialized realizations.
056 *
057 * @param <I> symbol class
058 * 
059 * @author Malte Isberner <malte.isberner@gmail.com>
060 */
061public abstract class Word<I> extends AbstractPrintable implements ArrayWritable<I>, Iterable<I> {
062        
063        
064        /**
065         * Retrieves the empty word.
066         * @return the empty word.
067         * @see Collections#emptyList()
068         */
069        @SuppressWarnings("unchecked")
070        public static <I> Word<I> epsilon() {
071                return (Word<I>)(Word<?>)EmptyWord.INSTANCE;
072        }
073        
074        /**
075         * Constructs a word from a single letter.
076         * @param letter the letter
077         * @return a word consisting of only this letter
078         */
079        public static <I> Word<I> fromLetter(I letter) {
080                return new LetterWord<>(letter);
081        }
082        
083        /**
084         * Creates a word from an array of symbols.
085         * @param symbols the symbol array
086         * @return a word containing the symbols in the specified array
087         */
088        @SafeVarargs
089        public static <I> Word<I> fromSymbols(I ...symbols) {
090                if(symbols.length == 0)
091                        return epsilon();
092                if(symbols.length == 1)
093                        return fromLetter(symbols[0]);
094                Object[] array = new Object[symbols.length];
095                System.arraycopy(symbols, 0, array, 0, symbols.length);
096                return new SharedWord<>(symbols);
097        }
098        
099        /**
100         * Creates a word from a subrange of an array of symbols. Note that to
101         * ensure immutability, internally a copy of the array is made.
102         * @param symbols the symbols array
103         * @param offset the starting index in the array
104         * @param length the length of the resulting word (from the starting index on)
105         * @return the word consisting of the symbols in the range
106         */
107        public static <I> Word<I> fromArray(I[] symbols, int offset, int length) {
108                if(length == 0)
109                        return epsilon();
110                if(length == 1)
111                        return fromLetter(symbols[offset]);
112                Object[] array = new Object[length];
113                System.arraycopy(symbols, offset, array, 0, length);
114                return new SharedWord<>(symbols);
115        }
116        
117        /**
118         * Creates a word from a list of symbols
119         * @param symbolList the list of symbols
120         * @return the resulting word
121         */
122        public static <I> Word<I> fromList(List<? extends I> symbolList) {
123                int siz = symbolList.size();
124                if(siz == 0)
125                        return epsilon();
126                if(siz == 1)
127                        return Word.<I>fromLetter(symbolList.get(0));
128                return new SharedWord<>(symbolList);
129        }
130        
131        public static Word<Character> fromString(String str) {
132                int len = str.length();
133                Character[] chars = new Character[str.length()];
134                for(int i = 0; i < len; i++)
135                        chars[i] = Character.valueOf(str.charAt(i));
136                return new SharedWord<>(chars);
137        }
138        
139        /*
140         * General word iterator
141         */
142        private class Iterator implements java.util.Iterator<I> {
143
144                private int index = 0;
145                
146                /*
147                 * (non-Javadoc)
148                 * @see java.util.Iterator#hasNext()
149                 */
150                @Override
151                public boolean hasNext() {
152                        return (index < Word.this.length());
153                }
154
155                /*
156                 * (non-Javadoc)
157                 * @see java.util.Iterator#next()
158                 */
159                @Override
160                public I next() {
161                        return Word.this.getSymbol(index++);
162                }
163                
164                /*
165                 * (non-Javadoc)
166                 * @see java.util.Iterator#remove()
167                 */
168                @Override
169                public void remove() {
170                        throw new UnsupportedOperationException();
171                }
172        }
173        
174        /*
175         * Representing a word as a list.
176         */
177        private class AsList extends AbstractList<I> {
178                
179                /*
180                 * (non-Javadoc)
181                 * @see java.util.AbstractList#get(int)
182                 */
183                @Override
184                public I get(int index) {
185                        return Word.this.getSymbol(index);
186                }
187
188                /*
189                 * (non-Javadoc)
190                 * @see java.util.AbstractCollection#size()
191                 */
192                @Override
193                public int size() {
194                        return Word.this.length();
195                }
196                
197                /*
198                 * (non-Javadoc)
199                 * @see java.util.AbstractList#iterator()
200                 */
201                @Override
202                public java.util.Iterator<I> iterator() {
203                        return Word.this.iterator();
204                }
205        }
206
207    /**
208     * Return symbol that is at the specified position
209     * @param i the position
210     * @return symbol at position i, <tt>null</tt> if no such symbol exists
211     */
212    public abstract I getSymbol(int index);
213    
214    /**
215     * Retrieves the length of this word.
216     * @return the length of this word.
217     */
218    public abstract int length();
219    
220    /*
221     * (non-Javadoc)
222     * @see java.lang.Object#hashCode()
223     */
224    @Override
225    public int hashCode() {
226        int hash = 5;
227        for(I sym : this) {
228                hash *= 89;
229                hash += (sym != null) ? sym.hashCode() : 0;
230        }
231        return hash;
232    }
233    
234    /*
235     * (non-Javadoc)
236     * @see java.lang.Object#equals(java.lang.Object)
237     */
238    @Override
239    public boolean equals(Object other) {
240        if(this == other)
241                return true;
242        if(other == null)
243                return false;
244        if(!(other instanceof Word))
245                return false;
246        Word<?> otherWord = (Word<?>)other;
247        int len = otherWord.length();
248        if(len != length())
249                return false;
250        java.util.Iterator<I> thisIt = iterator();
251        java.util.Iterator<?> otherIt = otherWord.iterator();
252        while(thisIt.hasNext()) {
253                I thisSym = thisIt.next();
254                Object otherSym = otherIt.next();
255                if(!Objects.equals(thisSym, otherSym))
256                        return false;
257        }
258        return true;
259    }
260    
261    /*
262     * (non-Javadoc)
263     * @see net.automatalib.commons.util.strings.Printable#print(java.lang.Appendable)
264     */
265    @Override
266        public void print(Appendable a) throws IOException {
267                StringUtil.appendIterable(a, this, " ");
268        }
269
270        /*
271     * (non-Javadoc)
272     * @see net.automatalib.commons.util.array.ArrayWritable#size()
273     */
274    @Override
275    public final int size() {
276        return length();
277    }
278    
279    
280    
281    /**
282     * Retrieves a word representing the specified subrange of this word.
283     * As words are immutable, this function usually can be realized quite efficient
284     * (implementing classes should take care of this).
285     * 
286     * @param fromIndex the first index, inclusive.
287     * @param toIndex the last index, exclusive.
288     * @return the word representing the specified subrange.
289     */
290    public final Word<I> subWord(int fromIndex, int toIndex) {
291        if(fromIndex < 0 || toIndex < fromIndex || toIndex > length())
292                throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ", " + toIndex + ")");
293        
294        return _subWord(fromIndex, toIndex);
295    }
296   
297    /**
298     * Retrieves the subword of this word starting at the given index and extending
299     * until the end of this word. Calling this method is equivalent to calling
300     * <pre>w.subWord(fromIndex, w.length())</pre>
301     * @param fromIndex the first index, inclusive
302     * @return the word representing the specified subrange
303     */
304    public final Word<I> subWord(int fromIndex) {
305        if(fromIndex <= 0) {
306                if(fromIndex == 0)
307                        return this;
308                throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ",)");
309        }
310        return _subWord(fromIndex, length());
311    }
312    
313    /**
314     * Internal subword operation implementation. In contrast to {@link #subWord(int, int)},
315     * no range checks need to be performed. As this method is flagged as <tt>protected</tt>,
316     * implementations may rely on the specified indices being valid.
317     * @param fromIndex the first index, inclusive (guaranteed to be valid)
318     * @param toIndex the last index, exclusive (guaranteed to be valid)
319     * @return the word representing the specified subrange
320     */
321    protected Word<I> _subWord(int fromIndex, int toIndex) {
322        int len = toIndex - fromIndex;
323        Object[] array = new Object[len];
324        writeToArray(fromIndex, array, 0, len);
325        return new SharedWord<>(array);
326    }
327    
328    /*
329     * (non-Javadoc)
330     * @see java.lang.Iterable#iterator()
331     */
332    @Override
333        public java.util.Iterator<I> iterator() {
334                return new Iterator();
335        }
336
337    /*
338     * (non-Javadoc)
339     * @see net.automatalib.commons.util.array.ArrayWritable#writeToArray(int, java.lang.Object[], int, int)
340     */
341    @Override
342        public void writeToArray(int offset, Object[] array, int tgtOffset, int length) {
343        int idx = offset, arrayIdx = tgtOffset;
344        
345        while(length-- > 0)
346                array[arrayIdx++] = getSymbol(idx++);
347    }
348    
349    
350    /**
351     * Retrieves a {@link List} view on the contents of this word.
352     * @return an unmodifiable list of the contained symbols.
353     */
354    public List<I> asList() {
355        return new AsList();
356    }
357    
358    /**
359         * Retrieves a prefix of the given length. If <code>length</code>
360         * is negative, then a prefix consisting of all but the last
361         * <code>-length</code> symbols is returned.
362         * 
363         * @param prefixLen the length of the prefix (may be negative, see above).
364         * @return the prefix of the given length.
365         */
366        public final Word<I> prefix(int prefixLen) {
367                if(prefixLen < 0)
368                        prefixLen = length() + prefixLen;
369                
370                return subWord(0, prefixLen);
371        }
372        
373        /**
374         * Retrieves a suffix of the given length. If <code>length</code> is
375         * negative, then a suffix consisting of all but the first
376         * <code>-length</code> symbols is returned.
377         * 
378         * @param suffixLen the length of the suffix (may be negative, see above).
379         * @return the suffix of the given length.
380         */
381        public final Word<I> suffix(int suffixLen) {
382                int wordLen = length();
383                int startIdx = (suffixLen < 0) ? -suffixLen : (wordLen - suffixLen);
384                
385                return subWord(startIdx, wordLen);
386        }
387        
388        /**
389         * Retrieves the list of all prefixes of this word. In the default implementation,
390         * the prefixes are lazily instantiated upon the respective calls of {@link List#get(int)}
391         * or {@link Iterator#next()}.
392         * @param longestFirst whether to start with the longest prefix (otherwise, the first prefix
393         * in the list will be the shortest).
394         * @return a (non-materialized) list containing all prefixes 
395         */
396        public List<Word<I>> prefixes(boolean longestFirst) {
397                return new SubwordList<>(this, true, longestFirst);
398        }
399        
400        /**
401         * Retrieves the list of all suffixes of this word. In the default implementation,
402         * the suffixes are lazily instantiated upon the respective calls of {@link List#get(int)}
403         * or {@link Iterator#next()}.
404         * @param longestFirst whether to start with the longest suffix (otherwise, the first suffix
405         * in the list will be the shortest).
406         * @return a (non-materialized) list containing all suffix 
407         */
408        public List<Word<I>> suffixes(boolean longestFirst) {
409                return new SubwordList<>(this, false, longestFirst);
410        }
411        
412        
413        /**
414         * Retrieves the next word after this in canonical order. Figuratively
415         * speaking, if there are <tt>k</tt> alphabet symbols, one can think of
416         * a word of length <tt>n</tt> as an <tt>n</tt>-digit radix-<tt>k</tt>
417         * representation of the number. The next word in canonical order
418         * is the representation for the number represented by this word plus one.
419         * @param sigma the alphabet
420         * @return the next word in canonical order
421         */
422        public Word<I> canonicalNext(Alphabet<I> sigma) {
423                int len = length();
424                Object[] symbols = new Object[len];
425                writeToArray(0, symbols, 0, len);
426                
427                int alphabetSize = sigma.size();
428                
429                int i = 0;
430                boolean overflow = true;
431                for(I sym : this) {
432                        int nextIdx = (sigma.getSymbolIndex(sym) + 1) % alphabetSize; 
433                        symbols[i++] = sigma.getSymbol(nextIdx);
434                        if(nextIdx != 0) {
435                                overflow = false;
436                                break;
437                        }
438                }
439                
440                while(i < len) {
441                        symbols[i] = getSymbol(i);
442                        i++;
443                }
444                
445                if(overflow) {
446                        Object[] newSymbols = new Object[len+1];
447                        newSymbols[0] = sigma.getSymbol(0);
448                        System.arraycopy(symbols, 0, newSymbols, 1, len);
449                        symbols = newSymbols;
450                }
451                
452                return new SharedWord<>(symbols);
453        }
454        
455        /**
456         * Retrieves the last symbol of this word.
457         * @return the last symbol of this word.
458         */
459        public I lastSymbol() {
460                return getSymbol(length() - 1);
461        }
462        
463        /**
464         * Retrieves the first symbol of this word.
465         * @return the first symbol of this word
466         */
467        public I firstSymbol() {
468                return getSymbol(0);
469        }
470        
471        /**
472         * Appends a symbol to this word and returns the result as a new word.
473         * @param symbol the symbol to append
474         * @return the word plus the given symbol
475         */
476        public Word<I> append(I symbol) {
477                int len = length();
478                Object[] array = new Object[len + 1];
479                writeToArray(0, array, 0, len);
480                array[len] = symbol;
481                return new SharedWord<>(array);
482        }
483        
484        /**
485         * Prepends a symbol to this word and returns the result as a new word.
486         * @param symbol the symbol to prepend
487         * @return the given symbol plus to word.
488         */
489        public Word<I> prepend(I symbol) {
490                int len = length();
491                Object[] array = new Object[len+1];
492                array[0] = symbol;
493                writeToArray(0, array, 1, len);
494                
495                return new SharedWord<>(array);
496        }
497        
498        /**
499         * Concatenates this word with several other words and returns the result
500         * as a new word.
501         * 
502         * Note that this method cannot be overridden. Implementing classes need to
503         * override the {@link #_concat(Word...)} method instead.
504         *  
505         * @param words the words to concatenate with this word
506         * @return the result of the concatenation
507         * @see #_concat(Word...)
508         */
509        @SafeVarargs
510        public final Word<I> concat(Word<I> ...words) {
511                return _concat(words);
512        }
513        
514        /**
515         * Realizes the concatenation of this word with several other words.
516         * @param words the words to concatenate
517         * @return the results of the concatenation
518         */
519        @SuppressWarnings("unchecked")
520        protected Word<I> _concat(Word<I> ...words) {
521                if(words.length == 0)
522                        return this;
523                
524                int len = length();
525                
526                int totalSize = len;
527                for(int i = 0; i < words.length; i++)
528                        totalSize += words[i].length();
529                
530                Object[] array = new Object[totalSize];
531                writeToArray(0, array, 0, len);
532                int currOfs = len;
533                for(int i = 0; i < words.length; i++) {
534                        Word<I> w = words[i];
535                        int wLen = w.length();
536                        w.writeToArray(0, array, currOfs, wLen);
537                        currOfs += wLen;
538                }
539
540                return new SharedWord<>(array);
541        }
542        
543        /**
544         * Checks if this word is a prefix of another word.
545         * @param other the other word
546         * @return <tt>true</tt> if this word is a prefix of the other word, <tt>false</tt>
547         * otherwise.
548         */
549        public boolean isPrefixOf(Word<I> other) {
550                int len = length(), otherLen = other.length();
551                if(otherLen < len)
552                        return false;
553                
554                for(int i = 0; i < len; i++) {
555                        I sym1 = getSymbol(i), sym2 = other.getSymbol(i);
556                        
557                        if(!Objects.equals(sym1,  sym2))
558                                return false;
559                }
560                return true;
561        }
562        
563        /**
564         * Determines the longest common prefix of this word and another
565         * word.
566         * @param other the other word
567         * @return the longest common prefix of this word and the other word
568         */
569        public Word<I> longestCommonPrefix(Word<I> other) {
570                int len = length(), otherLen = other.length();
571                int maxIdx = (len < otherLen) ? len : otherLen;
572                
573                int i = 0;
574                while(i < maxIdx) {
575                        I sym1 = getSymbol(i), sym2 = getSymbol(i);
576                        
577                        if(!Objects.equals(sym1, sym2))
578                                break;
579                        i++;
580                }
581                
582                return prefix(i);
583        }
584        
585        /**
586         * Checks if this word is a suffix of another word
587         * @param other the other word
588         * @return <tt>true</tt> if this word is a suffix of the other word, <tt>false</tt>
589         * otherwise. 
590         */
591        public boolean isSuffixOf(Word<I> other) {
592                int len = length(), otherLen = other.length();
593                if(otherLen < len)
594                        return false;
595                
596                int ofs = otherLen - len;
597                for(int i = 0; i < len; i++) {
598                        I sym1 = getSymbol(i), sym2 = other.getSymbol(ofs + i);
599                        if(!Objects.equals(sym1, sym2))
600                                return false;
601                }
602                return true;
603        }
604        
605        /**
606         * Determines the longest common suffix of this word and another word.
607         * @param other the other word
608         * @return the longest common suffix
609         */
610        public Word<I> longestCommonSuffix(Word<I> other) {
611                int len = length(), otherLen = other.length();
612                int minLen = (len < otherLen) ? len : otherLen;
613                
614                int idx1 = len, idx2 = otherLen;
615                int i = 0;
616                while(i < minLen) {
617                        I sym1 = getSymbol(--idx1), sym2 = other.getSymbol(--idx2);
618                        
619                        if(!Objects.equals(sym1, sym2))
620                                break;
621                        
622                        i++;
623                }
624                
625                return suffix(i);
626        }
627        
628        /**
629         * Retrieves a "flattened" version of this word, i.e., without any hierarchical structure
630         * attached.
631         * This can be helpful if {@link Word} is subclassed to allow representing, e.g., a concatenation
632         * dynamically, but due to performance concerns not too many levels of indirection
633         * should be introduced.
634         * @return a flattened version of this word.
635         */
636        public Word<I> flatten() {
637                int len = length();
638                Object[] array = new Object[len];
639                writeToArray(0, array, 0, len);
640                return new SharedWord<>(array);
641        }
642        
643        public Word<I> trimmed() {
644                int len = length();
645                Object[] array = new Object[len];
646                writeToArray(0, array, 0, len);
647                return new SharedWord<>(array);
648        }
649
650        /**
651         * Checks if this word is empty, i.e., contains no symbols.
652         * @return <tt>true</tt> if this word is empty, <tt>false</tt> otherwise.
653         */
654        public boolean isEmpty() {
655                return (length() == 0);
656        }
657}