001/* Copyright (C) 2013 TU Dortmund 002 * This file is part of AutomataLib, http://www.automatalib.net/. 003 * 004 * AutomataLib is free software; you can redistribute it and/or 005 * modify it under the terms of the GNU Lesser General Public 006 * License version 3.0 as published by the Free Software Foundation. 007 * 008 * AutomataLib is distributed in the hope that it will be useful, 009 * but WITHOUT ANY WARRANTY; without even the implied warranty of 010 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 011 * Lesser General Public License for more details. 012 * 013 * You should have received a copy of the GNU Lesser General Public 014 * License along with AutomataLib; if not, see 015 * http://www.gnu.de/documents/lgpl.en.html. 016 */ 017package net.automatalib.words; 018 019import java.io.IOException; 020import java.util.AbstractList; 021import java.util.Collections; 022import java.util.List; 023import java.util.Objects; 024 025import net.automatalib.commons.util.array.ArrayWritable; 026import net.automatalib.commons.util.strings.AbstractPrintable; 027import net.automatalib.commons.util.strings.StringUtil; 028 029/** 030 * A word is an ordered sequence of symbols. {@link Word}s are generally immutable, 031 * i.e., a single {@link Word} object will never change (unless symbol objects are modified, 032 * which is however highly discouraged). 033 * <p> 034 * This class provides the following static methods for creating words in the most common scenarios: 035 * <ul> 036 * <li> {@link #epsilon()} returns the empty word of length 0 037 * <li> {@link #fromLetter(Object)} turns a single letter into a word of length 1 038 * <li> {@link #fromSymbols(Object...)} creates a word from an array of symbols 039 * <li> {@link #fromArray(Object[], int, int)} creates a word from a subrange of a symbols array 040 * <li> {@link #fromList(List)} creates a word from a {@link List} of symbols 041 * </ul> 042 * <p> 043 * Modification operations like {@link #append(Object)} or {@link #concat(Word...)} create 044 * new objects, subsequently invoking these operations on the respective objects returned is 045 * therefore highly inefficient. If words need to be dynamically created, a {@link WordBuilder} 046 * should be used. 047 * <p> 048 * This is an abstract base class for word representations. Implementing classes only need to implement 049 * <ul> 050 * <li> {@link #getSymbol(int)} 051 * <li> {@link #length()} 052 * </ul> 053 * <p> 054 * However, for the sake of efficiency it is highly encouraged to overwrite the other methods 055 * as well, providing specialized realizations. 056 * 057 * @param <I> symbol class 058 * 059 * @author Malte Isberner <malte.isberner@gmail.com> 060 */ 061public abstract class Word<I> extends AbstractPrintable implements ArrayWritable<I>, Iterable<I> { 062 063 064 /** 065 * Retrieves the empty word. 066 * @return the empty word. 067 * @see Collections#emptyList() 068 */ 069 @SuppressWarnings("unchecked") 070 public static <I> Word<I> epsilon() { 071 return (Word<I>)(Word<?>)EmptyWord.INSTANCE; 072 } 073 074 /** 075 * Constructs a word from a single letter. 076 * @param letter the letter 077 * @return a word consisting of only this letter 078 */ 079 public static <I> Word<I> fromLetter(I letter) { 080 return new LetterWord<>(letter); 081 } 082 083 /** 084 * Creates a word from an array of symbols. 085 * @param symbols the symbol array 086 * @return a word containing the symbols in the specified array 087 */ 088 @SafeVarargs 089 public static <I> Word<I> fromSymbols(I ...symbols) { 090 if(symbols.length == 0) 091 return epsilon(); 092 if(symbols.length == 1) 093 return fromLetter(symbols[0]); 094 Object[] array = new Object[symbols.length]; 095 System.arraycopy(symbols, 0, array, 0, symbols.length); 096 return new SharedWord<>(symbols); 097 } 098 099 /** 100 * Creates a word from a subrange of an array of symbols. Note that to 101 * ensure immutability, internally a copy of the array is made. 102 * @param symbols the symbols array 103 * @param offset the starting index in the array 104 * @param length the length of the resulting word (from the starting index on) 105 * @return the word consisting of the symbols in the range 106 */ 107 public static <I> Word<I> fromArray(I[] symbols, int offset, int length) { 108 if(length == 0) 109 return epsilon(); 110 if(length == 1) 111 return fromLetter(symbols[offset]); 112 Object[] array = new Object[length]; 113 System.arraycopy(symbols, offset, array, 0, length); 114 return new SharedWord<>(symbols); 115 } 116 117 /** 118 * Creates a word from a list of symbols 119 * @param symbolList the list of symbols 120 * @return the resulting word 121 */ 122 public static <I> Word<I> fromList(List<? extends I> symbolList) { 123 int siz = symbolList.size(); 124 if(siz == 0) 125 return epsilon(); 126 if(siz == 1) 127 return Word.<I>fromLetter(symbolList.get(0)); 128 return new SharedWord<>(symbolList); 129 } 130 131 public static Word<Character> fromString(String str) { 132 int len = str.length(); 133 Character[] chars = new Character[str.length()]; 134 for(int i = 0; i < len; i++) 135 chars[i] = Character.valueOf(str.charAt(i)); 136 return new SharedWord<>(chars); 137 } 138 139 /* 140 * General word iterator 141 */ 142 private class Iterator implements java.util.Iterator<I> { 143 144 private int index = 0; 145 146 /* 147 * (non-Javadoc) 148 * @see java.util.Iterator#hasNext() 149 */ 150 @Override 151 public boolean hasNext() { 152 return (index < Word.this.length()); 153 } 154 155 /* 156 * (non-Javadoc) 157 * @see java.util.Iterator#next() 158 */ 159 @Override 160 public I next() { 161 return Word.this.getSymbol(index++); 162 } 163 164 /* 165 * (non-Javadoc) 166 * @see java.util.Iterator#remove() 167 */ 168 @Override 169 public void remove() { 170 throw new UnsupportedOperationException(); 171 } 172 } 173 174 /* 175 * Representing a word as a list. 176 */ 177 private class AsList extends AbstractList<I> { 178 179 /* 180 * (non-Javadoc) 181 * @see java.util.AbstractList#get(int) 182 */ 183 @Override 184 public I get(int index) { 185 return Word.this.getSymbol(index); 186 } 187 188 /* 189 * (non-Javadoc) 190 * @see java.util.AbstractCollection#size() 191 */ 192 @Override 193 public int size() { 194 return Word.this.length(); 195 } 196 197 /* 198 * (non-Javadoc) 199 * @see java.util.AbstractList#iterator() 200 */ 201 @Override 202 public java.util.Iterator<I> iterator() { 203 return Word.this.iterator(); 204 } 205 } 206 207 /** 208 * Return symbol that is at the specified position 209 * @param i the position 210 * @return symbol at position i, <tt>null</tt> if no such symbol exists 211 */ 212 public abstract I getSymbol(int index); 213 214 /** 215 * Retrieves the length of this word. 216 * @return the length of this word. 217 */ 218 public abstract int length(); 219 220 /* 221 * (non-Javadoc) 222 * @see java.lang.Object#hashCode() 223 */ 224 @Override 225 public int hashCode() { 226 int hash = 5; 227 int len = length(); 228 for(int i = 0; i < len; i++) { 229 hash *= 89; 230 I sym = getSymbol(i); 231 hash += (sym != null) ? sym.hashCode() : 0; 232 } 233 return hash; 234 } 235 236 /* 237 * (non-Javadoc) 238 * @see java.lang.Object#equals(java.lang.Object) 239 */ 240 @Override 241 public boolean equals(Object other) { 242 if(this == other) 243 return true; 244 if(other == null) 245 return false; 246 if(!(other instanceof Word)) 247 return false; 248 Word<?> otherWord = (Word<?>)other; 249 int len = otherWord.length(); 250 if(len != length()) 251 return false; 252 for(int i = 0; i < len; i++) { 253 if(!Objects.equals(getSymbol(i), otherWord.getSymbol(i))) 254 return false; 255 } 256 return true; 257 } 258 259 /* 260 * (non-Javadoc) 261 * @see net.automatalib.commons.util.strings.Printable#print(java.lang.Appendable) 262 */ 263 @Override 264 public void print(Appendable a) throws IOException { 265 StringUtil.appendIterable(a, this, " "); 266 } 267 268 /* 269 * (non-Javadoc) 270 * @see net.automatalib.commons.util.array.ArrayWritable#size() 271 */ 272 @Override 273 public final int size() { 274 return length(); 275 } 276 277 278 279 /** 280 * Retrieves a word representing the specified subrange of this word. 281 * As words are immutable, this function usually can be realized quite efficient 282 * (implementing classes should take care of this). 283 * 284 * @param fromIndex the first index, inclusive. 285 * @param toIndex the last index, exclusive. 286 * @return the word representing the specified subrange. 287 */ 288 public final Word<I> subWord(int fromIndex, int toIndex) { 289 if(fromIndex < 0 || toIndex < fromIndex || toIndex > length()) 290 throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ", " + toIndex + ")"); 291 292 return _subWord(fromIndex, toIndex); 293 } 294 295 /** 296 * Retrieves the subword of this word starting at the given index and extending 297 * until the end of this word. Calling this method is equivalent to calling 298 * <pre>w.subWord(fromIndex, w.length())</pre> 299 * @param fromIndex the first index, inclusive 300 * @return the word representing the specified subrange 301 */ 302 public final Word<I> subWord(int fromIndex) { 303 if(fromIndex <= 0) { 304 if(fromIndex == 0) 305 return this; 306 throw new IndexOutOfBoundsException("Invalid subword range [" + fromIndex + ",)"); 307 } 308 return _subWord(fromIndex, length()); 309 } 310 311 /** 312 * Internal subword operation implementation. In contrast to {@link #subWord(int, int)}, 313 * no range checks need to be performed. As this method is flagged as <tt>protected</tt>, 314 * implementations may rely on the specified indices being valid. 315 * @param fromIndex the first index, inclusive (guaranteed to be valid) 316 * @param toIndex the last index, exclusive (guaranteed to be valid) 317 * @return the word representing the specified subrange 318 */ 319 protected Word<I> _subWord(int fromIndex, int toIndex) { 320 int len = toIndex - fromIndex; 321 Object[] array = new Object[len]; 322 writeToArray(fromIndex, array, 0, len); 323 return new SharedWord<>(array); 324 } 325 326 /* 327 * (non-Javadoc) 328 * @see java.lang.Iterable#iterator() 329 */ 330 @Override 331 public java.util.Iterator<I> iterator() { 332 return new Iterator(); 333 } 334 335 /* 336 * (non-Javadoc) 337 * @see net.automatalib.commons.util.array.ArrayWritable#writeToArray(int, java.lang.Object[], int, int) 338 */ 339 @Override 340 public void writeToArray(int offset, Object[] array, int tgtOffset, int length) { 341 int idx = offset, arrayIdx = tgtOffset; 342 343 while(length-- > 0) 344 array[arrayIdx++] = getSymbol(idx++); 345 } 346 347 348 /** 349 * Retrieves a {@link List} view on the contents of this word. 350 * @return an unmodifiable list of the contained symbols. 351 */ 352 public List<I> asList() { 353 return new AsList(); 354 } 355 356 /** 357 * Retrieves a prefix of the given length. If <code>length</code> 358 * is negative, then a prefix consisting of all but the last 359 * <code>-length</code> symbols is returned. 360 * 361 * @param prefixLen the length of the prefix (may be negative, see above). 362 * @return the prefix of the given length. 363 */ 364 public final Word<I> prefix(int prefixLen) { 365 if(prefixLen < 0) 366 prefixLen = length() + prefixLen; 367 368 return subWord(0, prefixLen); 369 } 370 371 /** 372 * Retrieves a suffix of the given length. If <code>length</code> is 373 * negative, then a suffix consisting of all but the first 374 * <code>-length</code> symbols is returned. 375 * 376 * @param suffixLen the length of the suffix (may be negative, see above). 377 * @return the suffix of the given length. 378 */ 379 public final Word<I> suffix(int suffixLen) { 380 int wordLen = length(); 381 int startIdx = (suffixLen < 0) ? -suffixLen : (wordLen - suffixLen); 382 383 return subWord(startIdx, wordLen); 384 } 385 386 /** 387 * Retrieves the list of all prefixes of this word. In the default implementation, 388 * the prefixes are lazily instantiated upon the respective calls of {@link List#get(int)} 389 * or {@link Iterator#next()}. 390 * @param longestFirst whether to start with the longest prefix (otherwise, the first prefix 391 * in the list will be the shortest). 392 * @return a (non-materialized) list containing all prefixes 393 */ 394 public List<Word<I>> prefixes(boolean longestFirst) { 395 return new SubwordList<>(this, true, longestFirst); 396 } 397 398 /** 399 * Retrieves the list of all suffixes of this word. In the default implementation, 400 * the suffixes are lazily instantiated upon the respective calls of {@link List#get(int)} 401 * or {@link Iterator#next()}. 402 * @param longestFirst whether to start with the longest suffix (otherwise, the first suffix 403 * in the list will be the shortest). 404 * @return a (non-materialized) list containing all suffix 405 */ 406 public List<Word<I>> suffixes(boolean longestFirst) { 407 return new SubwordList<>(this, false, longestFirst); 408 } 409 410 411 /** 412 * Retrieves the next word after this in canonical order. Figuratively 413 * speaking, if there are <tt>k</tt> alphabet symbols, one can think of 414 * a word of length <tt>n</tt> as an <tt>n</tt>-digit radix-<tt>k</tt> 415 * representation of the number. The next word in canonical order 416 * is the representation for the number represented by this word plus one. 417 * @param sigma the alphabet 418 * @return the next word in canonical order 419 */ 420 public Word<I> canonicalNext(Alphabet<I> sigma) { 421 int len = length(); 422 Object[] symbols = new Object[len]; 423 writeToArray(0, symbols, 0, len); 424 425 int alphabetSize = sigma.size(); 426 427 int i = 0; 428 boolean overflow = true; 429 for(I sym : this) { 430 int nextIdx = (sigma.getSymbolIndex(sym) + 1) % alphabetSize; 431 symbols[i++] = sigma.getSymbol(nextIdx); 432 if(nextIdx != 0) { 433 overflow = false; 434 break; 435 } 436 } 437 438 while(i < len) { 439 symbols[i] = getSymbol(i); 440 i++; 441 } 442 443 if(overflow) { 444 Object[] newSymbols = new Object[len+1]; 445 newSymbols[0] = sigma.getSymbol(0); 446 System.arraycopy(symbols, 0, newSymbols, 1, len); 447 symbols = newSymbols; 448 } 449 450 return new SharedWord<>(symbols); 451 } 452 453 /** 454 * Retrieves the last symbol of this word. 455 * @return the last symbol of this word. 456 */ 457 public I lastSymbol() { 458 return getSymbol(length() - 1); 459 } 460 461 /** 462 * Retrieves the first symbol of this word. 463 * @return the first symbol of this word 464 */ 465 public I firstSymbol() { 466 return getSymbol(0); 467 } 468 469 /** 470 * Appends a symbol to this word and returns the result as a new word. 471 * @param symbol the symbol to append 472 * @return the word plus the given symbol 473 */ 474 public Word<I> append(I symbol) { 475 int len = length(); 476 Object[] array = new Object[len + 1]; 477 writeToArray(0, array, 0, len); 478 array[len] = symbol; 479 return new SharedWord<>(array); 480 } 481 482 /** 483 * Prepends a symbol to this word and returns the result as a new word. 484 * @param symbol the symbol to prepend 485 * @return the given symbol plus to word. 486 */ 487 public Word<I> prepend(I symbol) { 488 int len = length(); 489 Object[] array = new Object[len+1]; 490 array[0] = symbol; 491 writeToArray(0, array, 1, len); 492 493 return new SharedWord<>(array); 494 } 495 496 /** 497 * Concatenates this word with several other words and returns the result 498 * as a new word. 499 * 500 * Note that this method cannot be overridden. Implementing classes need to 501 * override the {@link #_concat(Word...)} method instead. 502 * 503 * @param words the words to concatenate with this word 504 * @return the result of the concatenation 505 * @see #_concat(Word...) 506 */ 507 @SafeVarargs 508 public final Word<I> concat(Word<I> ...words) { 509 return _concat(words); 510 } 511 512 /** 513 * Realizes the concatenation of this word with several other words. 514 * @param words the words to concatenate 515 * @return the results of the concatenation 516 */ 517 @SuppressWarnings("unchecked") 518 protected Word<I> _concat(Word<I> ...words) { 519 if(words.length == 0) 520 return this; 521 522 int len = length(); 523 524 int totalSize = len; 525 for(int i = 0; i < words.length; i++) 526 totalSize += words[i].length(); 527 528 Object[] array = new Object[totalSize]; 529 writeToArray(0, array, 0, len); 530 int currOfs = len; 531 for(int i = 0; i < words.length; i++) { 532 Word<I> w = words[i]; 533 int wLen = w.length(); 534 w.writeToArray(0, array, currOfs, wLen); 535 currOfs += wLen; 536 } 537 538 return new SharedWord<>(array); 539 } 540 541 /** 542 * Checks if this word is a prefix of another word. 543 * @param other the other word 544 * @return <tt>true</tt> if this word is a prefix of the other word, <tt>false</tt> 545 * otherwise. 546 */ 547 public boolean isPrefixOf(Word<I> other) { 548 int len = length(), otherLen = other.length(); 549 if(otherLen < len) 550 return false; 551 552 for(int i = 0; i < len; i++) { 553 I sym1 = getSymbol(i), sym2 = other.getSymbol(i); 554 555 if(!Objects.equals(sym1, sym2)) 556 return false; 557 } 558 return true; 559 } 560 561 /** 562 * Determines the longest common prefix of this word and another 563 * word. 564 * @param other the other word 565 * @return the longest common prefix of this word and the other word 566 */ 567 public Word<I> longestCommonPrefix(Word<I> other) { 568 int len = length(), otherLen = other.length(); 569 int maxIdx = (len < otherLen) ? len : otherLen; 570 571 int i = 0; 572 while(i < maxIdx) { 573 I sym1 = getSymbol(i), sym2 = getSymbol(i); 574 575 if(!Objects.equals(sym1, sym2)) 576 break; 577 i++; 578 } 579 580 return prefix(i); 581 } 582 583 /** 584 * Checks if this word is a suffix of another word 585 * @param other the other word 586 * @return <tt>true</tt> if this word is a suffix of the other word, <tt>false</tt> 587 * otherwise. 588 */ 589 public boolean isSuffixOf(Word<I> other) { 590 int len = length(), otherLen = other.length(); 591 if(otherLen < len) 592 return false; 593 594 int ofs = otherLen - len; 595 for(int i = 0; i < len; i++) { 596 I sym1 = getSymbol(i), sym2 = other.getSymbol(ofs + i); 597 if(!Objects.equals(sym1, sym2)) 598 return false; 599 } 600 return true; 601 } 602 603 /** 604 * Determines the longest common suffix of this word and another word. 605 * @param other the other word 606 * @return the longest common suffix 607 */ 608 public Word<I> longestCommonSuffix(Word<I> other) { 609 int len = length(), otherLen = other.length(); 610 int minLen = (len < otherLen) ? len : otherLen; 611 612 int idx1 = len, idx2 = otherLen; 613 int i = 0; 614 while(i < minLen) { 615 I sym1 = getSymbol(--idx1), sym2 = other.getSymbol(--idx2); 616 617 if(!Objects.equals(sym1, sym2)) 618 break; 619 620 i++; 621 } 622 623 return suffix(i); 624 } 625 626 /** 627 * Retrieves a "flattened" version of this word, i.e., without any hierarchical structure 628 * attached. 629 * This can be helpful if {@link Word} is subclassed to allow representing, e.g., a concatenation 630 * dynamically, but due to performance concerns not too many levels of indirection 631 * should be introduced. 632 * @return a flattened version of this word. 633 */ 634 public Word<I> flatten() { 635 int len = length(); 636 Object[] array = new Object[len]; 637 writeToArray(0, array, 0, len); 638 return new SharedWord<>(array); 639 } 640 641 public Word<I> trimmed() { 642 int len = length(); 643 Object[] array = new Object[len]; 644 writeToArray(0, array, 0, len); 645 return new SharedWord<>(array); 646 } 647 648 /** 649 * Checks if this word is empty, i.e., contains no symbols. 650 * @return <tt>true</tt> if this word is empty, <tt>false</tt> otherwise. 651 */ 652 public boolean isEmpty() { 653 return (length() == 0); 654 } 655}