001/* Copyright (C) 2013 TU Dortmund
002 * This file is part of AutomataLib, http://www.automatalib.net/.
003 * 
004 * AutomataLib is free software; you can redistribute it and/or
005 * modify it under the terms of the GNU Lesser General Public
006 * License version 3.0 as published by the Free Software Foundation.
007 * 
008 * AutomataLib is distributed in the hope that it will be useful,
009 * but WITHOUT ANY WARRANTY; without even the implied warranty of
010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
011 * Lesser General Public License for more details.
012 * 
013 * You should have received a copy of the GNU Lesser General Public
014 * License along with AutomataLib; if not, see
015 * http://www.gnu.de/documents/lgpl.en.html.
016 */
017package net.automatalib.words;
018
019import java.io.IOException;
020import java.util.AbstractList;
021import java.util.Collections;
022import java.util.List;
023import java.util.Objects;
024
025import net.automatalib.commons.util.array.ArrayWritable;
026import net.automatalib.commons.util.strings.AbstractPrintable;
027import net.automatalib.commons.util.strings.StringUtil;
028
029/**
030 * A word is an ordered sequence of symbols. {@link Word}s are generally immutable,
031 * i.e., a single {@link Word} object will never change (unless symbol objects are modified,
032 * which is however highly discouraged).
033 * <p>
034 * This class provides the following static methods for creating words in the most common scenarios:
035 * <ul>
036 * <li> {@link #epsilon()} returns the empty word of length 0
037 * <li> {@link #fromLetter(Object)} turns a single letter into a word of length 1
038 * <li> {@link #fromSymbols(Object...)} creates a word from an array of symbols
039 * <li> {@link #fromArray(Object[], int, int)} creates a word from a subrange of a symbols array
040 * <li> {@link #fromList(List)} creates a word from a {@link List} of symbols
041 * </ul>
042 * <p>
043 * Modification operations like {@link #append(Object)} or {@link #concat(Word...)} create
044 * new objects, subsequently invoking these operations on the respective objects returned is
045 * therefore highly inefficient. If words need to be dynamically created, a {@link WordBuilder}
046 * should be used.
047 * <p>
048 * This is an abstract base class for word representations. Implementing classes only need to implement
049 * <ul>
050 * <li> {@link #getSymbol(int)}
051 * <li> {@link #length()}
052 * </ul>
053 * <p>
054 * However, for the sake of efficiency it is highly encouraged to overwrite the other methods
055 * as well, providing specialized realizations.
056 *
057 * @param <I> symbol class
058 * 
059 * @author Malte Isberner <malte.isberner@gmail.com>
060 */
061public abstract class Word<I> extends AbstractPrintable implements ArrayWritable<I>, Iterable<I> {
062        
063        
064        /**
065         * Retrieves the empty word.
066         * @return the empty word.
067         * @see Collections#emptyList()
068         */
069        @SuppressWarnings("unchecked")
070        public static <I> Word<I> epsilon() {
071                return (Word<I>)(Word<?>)EmptyWord.INSTANCE;
072        }
073        
074        /**
075         * Constructs a word from a single letter.
076         * @param letter the letter
077         * @return a word consisting of only this letter
078         */
079        public static <I> Word<I> fromLetter(I letter) {
080                return new LetterWord<>(letter);
081        }
082        
083        /**
084         * Creates a word from an array of symbols.
085         * @param symbols the symbol array
086         * @return a word containing the symbols in the specified array
087         */
088        @SafeVarargs
089        public static <I> Word<I> fromSymbols(I ...symbols) {
090                if(symbols.length == 0)
091                        return epsilon();
092                if(symbols.length == 1)
093                        return fromLetter(symbols[0]);
094                Object[] array = new Object[symbols.length];
095                System.arraycopy(symbols, 0, array, 0, symbols.length);
096                return new SharedWord<>(symbols);
097        }
098        
099        /**
100         * Creates a word from a subrange of an array of symbols. Note that to
101         * ensure immutability, internally a copy of the array is made.
102         * @param symbols the symbols array
103         * @param offset the starting index in the array
104         * @param length the length of the resulting word (from the starting index on)
105         * @return the word consisting of the symbols in the range
106         */
107        public static <I> Word<I> fromArray(I[] symbols, int offset, int length) {
108                if(length == 0)
109                        return epsilon();
110                if(length == 1)
111                        return fromLetter(symbols[offset]);
112                Object[] array = new Object[length];
113                System.arraycopy(symbols, offset, array, 0, length);
114                return new SharedWord<>(symbols);
115        }
116        
117        /**
118         * Creates a word from a list of symbols
119         * @param symbolList the list of symbols
120         * @return the resulting word
121         */
122        public static <I> Word<I> fromList(List<? extends I> symbolList) {
123                int siz = symbolList.size();
124                if(siz == 0)
125                        return epsilon();
126                if(siz == 1)
127                        return Word.<I>fromLetter(symbolList.get(0));
128                return new SharedWord<>(symbolList);
129        }
130        
131        public static Word<Character> fromString(String str) {
132                int len = str.length();
133                Character[] chars = new Character[str.length()];
134                for(int i = 0; i < len; i++)
135                        chars[i] = Character.valueOf(str.charAt(i));
136                return new SharedWord<>(chars);
137        }
138        
139        /*
140         * General word iterator
141         */
142        private class Iterator implements java.util.Iterator<I> {
143
144                private int index = 0;
145                
146                /*
147                 * (non-Javadoc)
148                 * @see java.util.Iterator#hasNext()
149                 */
150                @Override
151                public boolean hasNext() {
152                        return (index < Word.this.length());
153                }
154
155                /*
156                 * (non-Javadoc)
157                 * @see java.util.Iterator#next()
158                 */
159                @Override
160                public I next() {
161                        return Word.this.getSymbol(index++);
162                }
163                
164                /*
165                 * (non-Javadoc)
166                 * @see java.util.Iterator#remove()
167                 */
168                @Override
169                public void remove() {
170                        throw new UnsupportedOperationException();
171                }
172        }
173        
174        /*
175         * Representing a word as a list.
176         */
177        private class AsList extends AbstractList<I> {
178                
179                /*
180                 * (non-Javadoc)
181                 * @see java.util.AbstractList#get(int)
182                 */
183                @Override
184                public I get(int index) {
185                        return Word.this.getSymbol(index);
186                }
187
188                /*
189                 * (non-Javadoc)
190                 * @see java.util.AbstractCollection#size()
191                 */
192                @Override
193                public int size() {
194                        return Word.this.length();
195                }
196                
197                /*
198                 * (non-Javadoc)
199                 * @see java.util.AbstractList#iterator()
200                 */
201                @Override
202                public java.util.Iterator<I> iterator() {
203                        return Word.this.iterator();
204                }
205        }
206
207    /**
208     * Return symbol that is at the specified position
209     * @param i the position
210     * @return symbol at position i, <tt>null</tt> if no such symbol exists
211     */
212    public abstract I getSymbol(int index);
213    
214    /**
215     * Retrieves the length of this word.
216     * @return the length of this word.
217     */
218    public abstract int length();
219    
220    /*
221     * (non-Javadoc)
222     * @see java.lang.Object#hashCode()
223     */
224    @Override
225    public int hashCode() {
226        int hash = 5;
227        int len = length();
228        for(int i = 0; i < len; i++) {
229                hash *= 89;
230                I sym = getSymbol(i);
231                hash += (sym != null) ? sym.hashCode() : 0;
232        }
233        return hash;
234    }
235    
236    /*
237     * (non-Javadoc)
238     * @see java.lang.Object#equals(java.lang.Object)
239     */
240    @Override
241    public boolean equals(Object other) {
242        if(this == other)
243                return true;
244        if(other == null)
245                return false;
246        if(!(other instanceof Word))
247                return false;
248        Word<?> otherWord = (Word<?>)other;
249        int len = otherWord.length();
250        if(len != length())
251                return false;
252        for(int i = 0; i < len; i++) {
253                if(!Objects.equals(getSymbol(i), otherWord.getSymbol(i)))
254                        return false;
255        }
256        return true;
257    }
258    
259    /*
260     * (non-Javadoc)
261     * @see net.automatalib.commons.util.strings.Printable#print(java.lang.Appendable)
262     */
263    @Override
264        public void print(Appendable a) throws IOException {
265                StringUtil.appendIterable(a, this, " ");
266        }
267
268        /*
269     * (non-Javadoc)
270     * @see net.automatalib.commons.util.array.ArrayWritable#size()
271     */
272    @Override
273    public final int size() {
274        return length();
275    }
276    
277    
278    
279    /**
280     * Retrieves a word representing the specified subrange of this word.
281     * As words are immutable, this function usually can be realized quite efficient
282     * (implementing classes should take care of this).
283     * 
284     * @param fromIndex the first index, inclusive.
285     * @param toIndex the last index, exclusive.
286     * @return the word representing the specified subrange.
287     */
288    public final Word<I> subWord(int fromIndex, int toIndex) {
289        if(fromIndex < 0 || toIndex < fromIndex || toIndex > length())
290                throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ", " + toIndex + ")");
291        
292        return _subWord(fromIndex, toIndex);
293    }
294   
295    /**
296     * Retrieves the subword of this word starting at the given index and extending
297     * until the end of this word. Calling this method is equivalent to calling
298     * <pre>w.subWord(fromIndex, w.length())</pre>
299     * @param fromIndex the first index, inclusive
300     * @return the word representing the specified subrange
301     */
302    public final Word<I> subWord(int fromIndex) {
303        if(fromIndex <= 0) {
304                if(fromIndex == 0)
305                        return this;
306                throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ",)");
307        }
308        return _subWord(fromIndex, length());
309    }
310    
311    /**
312     * Internal subword operation implementation. In contrast to {@link #subWord(int, int)},
313     * no range checks need to be performed. As this method is flagged as <tt>protected</tt>,
314     * implementations may rely on the specified indices being valid.
315     * @param fromIndex the first index, inclusive (guaranteed to be valid)
316     * @param toIndex the last index, exclusive (guaranteed to be valid)
317     * @return the word representing the specified subrange
318     */
319    protected Word<I> _subWord(int fromIndex, int toIndex) {
320        int len = toIndex - fromIndex;
321        Object[] array = new Object[len];
322        writeToArray(fromIndex, array, 0, len);
323        return new SharedWord<>(array);
324    }
325    
326    /*
327     * (non-Javadoc)
328     * @see java.lang.Iterable#iterator()
329     */
330    @Override
331        public java.util.Iterator<I> iterator() {
332                return new Iterator();
333        }
334
335    /*
336     * (non-Javadoc)
337     * @see net.automatalib.commons.util.array.ArrayWritable#writeToArray(int, java.lang.Object[], int, int)
338     */
339    @Override
340        public void writeToArray(int offset, Object[] array, int tgtOffset, int length) {
341        int idx = offset, arrayIdx = tgtOffset;
342        
343        while(length-- > 0)
344                array[arrayIdx++] = getSymbol(idx++);
345    }
346    
347    
348    /**
349     * Retrieves a {@link List} view on the contents of this word.
350     * @return an unmodifiable list of the contained symbols.
351     */
352    public List<I> asList() {
353        return new AsList();
354    }
355    
356    /**
357         * Retrieves a prefix of the given length. If <code>length</code>
358         * is negative, then a prefix consisting of all but the last
359         * <code>-length</code> symbols is returned.
360         * 
361         * @param prefixLen the length of the prefix (may be negative, see above).
362         * @return the prefix of the given length.
363         */
364        public final Word<I> prefix(int prefixLen) {
365                if(prefixLen < 0)
366                        prefixLen = length() + prefixLen;
367                
368                return subWord(0, prefixLen);
369        }
370        
371        /**
372         * Retrieves a suffix of the given length. If <code>length</code> is
373         * negative, then a suffix consisting of all but the first
374         * <code>-length</code> symbols is returned.
375         * 
376         * @param suffixLen the length of the suffix (may be negative, see above).
377         * @return the suffix of the given length.
378         */
379        public final Word<I> suffix(int suffixLen) {
380                int wordLen = length();
381                int startIdx = (suffixLen < 0) ? -suffixLen : (wordLen - suffixLen);
382                
383                return subWord(startIdx, wordLen);
384        }
385        
386        /**
387         * Retrieves the list of all prefixes of this word. In the default implementation,
388         * the prefixes are lazily instantiated upon the respective calls of {@link List#get(int)}
389         * or {@link Iterator#next()}.
390         * @param longestFirst whether to start with the longest prefix (otherwise, the first prefix
391         * in the list will be the shortest).
392         * @return a (non-materialized) list containing all prefixes 
393         */
394        public List<Word<I>> prefixes(boolean longestFirst) {
395                return new SubwordList<>(this, true, longestFirst);
396        }
397        
398        /**
399         * Retrieves the list of all suffixes of this word. In the default implementation,
400         * the suffixes are lazily instantiated upon the respective calls of {@link List#get(int)}
401         * or {@link Iterator#next()}.
402         * @param longestFirst whether to start with the longest suffix (otherwise, the first suffix
403         * in the list will be the shortest).
404         * @return a (non-materialized) list containing all suffix 
405         */
406        public List<Word<I>> suffixes(boolean longestFirst) {
407                return new SubwordList<>(this, false, longestFirst);
408        }
409        
410        
411        /**
412         * Retrieves the next word after this in canonical order. Figuratively
413         * speaking, if there are <tt>k</tt> alphabet symbols, one can think of
414         * a word of length <tt>n</tt> as an <tt>n</tt>-digit radix-<tt>k</tt>
415         * representation of the number. The next word in canonical order
416         * is the representation for the number represented by this word plus one.
417         * @param sigma the alphabet
418         * @return the next word in canonical order
419         */
420        public Word<I> canonicalNext(Alphabet<I> sigma) {
421                int len = length();
422                Object[] symbols = new Object[len];
423                writeToArray(0, symbols, 0, len);
424                
425                int alphabetSize = sigma.size();
426                
427                int i = 0;
428                boolean overflow = true;
429                for(I sym : this) {
430                        int nextIdx = (sigma.getSymbolIndex(sym) + 1) % alphabetSize; 
431                        symbols[i++] = sigma.getSymbol(nextIdx);
432                        if(nextIdx != 0) {
433                                overflow = false;
434                                break;
435                        }
436                }
437                
438                while(i < len) {
439                        symbols[i] = getSymbol(i);
440                        i++;
441                }
442                
443                if(overflow) {
444                        Object[] newSymbols = new Object[len+1];
445                        newSymbols[0] = sigma.getSymbol(0);
446                        System.arraycopy(symbols, 0, newSymbols, 1, len);
447                        symbols = newSymbols;
448                }
449                
450                return new SharedWord<>(symbols);
451        }
452        
453        /**
454         * Retrieves the last symbol of this word.
455         * @return the last symbol of this word.
456         */
457        public I lastSymbol() {
458                return getSymbol(length() - 1);
459        }
460        
461        /**
462         * Retrieves the first symbol of this word.
463         * @return the first symbol of this word
464         */
465        public I firstSymbol() {
466                return getSymbol(0);
467        }
468        
469        /**
470         * Appends a symbol to this word and returns the result as a new word.
471         * @param symbol the symbol to append
472         * @return the word plus the given symbol
473         */
474        public Word<I> append(I symbol) {
475                int len = length();
476                Object[] array = new Object[len + 1];
477                writeToArray(0, array, 0, len);
478                array[len] = symbol;
479                return new SharedWord<>(array);
480        }
481        
482        /**
483         * Prepends a symbol to this word and returns the result as a new word.
484         * @param symbol the symbol to prepend
485         * @return the given symbol plus to word.
486         */
487        public Word<I> prepend(I symbol) {
488                int len = length();
489                Object[] array = new Object[len+1];
490                array[0] = symbol;
491                writeToArray(0, array, 1, len);
492                
493                return new SharedWord<>(array);
494        }
495        
496        /**
497         * Concatenates this word with several other words and returns the result
498         * as a new word.
499         * 
500         * Note that this method cannot be overridden. Implementing classes need to
501         * override the {@link #_concat(Word...)} method instead.
502         *  
503         * @param words the words to concatenate with this word
504         * @return the result of the concatenation
505         * @see #_concat(Word...)
506         */
507        @SafeVarargs
508        public final Word<I> concat(Word<I> ...words) {
509                return _concat(words);
510        }
511        
512        /**
513         * Realizes the concatenation of this word with several other words.
514         * @param words the words to concatenate
515         * @return the results of the concatenation
516         */
517        @SuppressWarnings("unchecked")
518        protected Word<I> _concat(Word<I> ...words) {
519                if(words.length == 0)
520                        return this;
521                
522                int len = length();
523                
524                int totalSize = len;
525                for(int i = 0; i < words.length; i++)
526                        totalSize += words[i].length();
527                
528                Object[] array = new Object[totalSize];
529                writeToArray(0, array, 0, len);
530                int currOfs = len;
531                for(int i = 0; i < words.length; i++) {
532                        Word<I> w = words[i];
533                        int wLen = w.length();
534                        w.writeToArray(0, array, currOfs, wLen);
535                        currOfs += wLen;
536                }
537
538                return new SharedWord<>(array);
539        }
540        
541        /**
542         * Checks if this word is a prefix of another word.
543         * @param other the other word
544         * @return <tt>true</tt> if this word is a prefix of the other word, <tt>false</tt>
545         * otherwise.
546         */
547        public boolean isPrefixOf(Word<I> other) {
548                int len = length(), otherLen = other.length();
549                if(otherLen < len)
550                        return false;
551                
552                for(int i = 0; i < len; i++) {
553                        I sym1 = getSymbol(i), sym2 = other.getSymbol(i);
554                        
555                        if(!Objects.equals(sym1,  sym2))
556                                return false;
557                }
558                return true;
559        }
560        
561        /**
562         * Determines the longest common prefix of this word and another
563         * word.
564         * @param other the other word
565         * @return the longest common prefix of this word and the other word
566         */
567        public Word<I> longestCommonPrefix(Word<I> other) {
568                int len = length(), otherLen = other.length();
569                int maxIdx = (len < otherLen) ? len : otherLen;
570                
571                int i = 0;
572                while(i < maxIdx) {
573                        I sym1 = getSymbol(i), sym2 = getSymbol(i);
574                        
575                        if(!Objects.equals(sym1, sym2))
576                                break;
577                        i++;
578                }
579                
580                return prefix(i);
581        }
582        
583        /**
584         * Checks if this word is a suffix of another word
585         * @param other the other word
586         * @return <tt>true</tt> if this word is a suffix of the other word, <tt>false</tt>
587         * otherwise. 
588         */
589        public boolean isSuffixOf(Word<I> other) {
590                int len = length(), otherLen = other.length();
591                if(otherLen < len)
592                        return false;
593                
594                int ofs = otherLen - len;
595                for(int i = 0; i < len; i++) {
596                        I sym1 = getSymbol(i), sym2 = other.getSymbol(ofs + i);
597                        if(!Objects.equals(sym1, sym2))
598                                return false;
599                }
600                return true;
601        }
602        
603        /**
604         * Determines the longest common suffix of this word and another word.
605         * @param other the other word
606         * @return the longest common suffix
607         */
608        public Word<I> longestCommonSuffix(Word<I> other) {
609                int len = length(), otherLen = other.length();
610                int minLen = (len < otherLen) ? len : otherLen;
611                
612                int idx1 = len, idx2 = otherLen;
613                int i = 0;
614                while(i < minLen) {
615                        I sym1 = getSymbol(--idx1), sym2 = other.getSymbol(--idx2);
616                        
617                        if(!Objects.equals(sym1, sym2))
618                                break;
619                        
620                        i++;
621                }
622                
623                return suffix(i);
624        }
625        
626        /**
627         * Retrieves a "flattened" version of this word, i.e., without any hierarchical structure
628         * attached.
629         * This can be helpful if {@link Word} is subclassed to allow representing, e.g., a concatenation
630         * dynamically, but due to performance concerns not too many levels of indirection
631         * should be introduced.
632         * @return a flattened version of this word.
633         */
634        public Word<I> flatten() {
635                int len = length();
636                Object[] array = new Object[len];
637                writeToArray(0, array, 0, len);
638                return new SharedWord<>(array);
639        }
640        
641        public Word<I> trimmed() {
642                int len = length();
643                Object[] array = new Object[len];
644                writeToArray(0, array, 0, len);
645                return new SharedWord<>(array);
646        }
647
648        /**
649         * Checks if this word is empty, i.e., contains no symbols.
650         * @return <tt>true</tt> if this word is empty, <tt>false</tt> otherwise.
651         */
652        public boolean isEmpty() {
653                return (length() == 0);
654        }
655}