Skip to content

Commit

Permalink
ICU-22845 Better iterations for the ICU4J UnicodeSet
Browse files Browse the repository at this point in the history
  • Loading branch information
mihnita committed Aug 2, 2024
1 parent 10fe2a6 commit 84e315c
Show file tree
Hide file tree
Showing 2 changed files with 378 additions and 7 deletions.
204 changes: 197 additions & 7 deletions icu4j/main/core/src/main/java/com/ibm/icu/text/UnicodeSet.java
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,12 @@
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.SortedSet;
import java.util.Spliterator;
import java.util.TreeSet;
import java.util.function.IntConsumer;
import java.util.stream.IntStream;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import com.ibm.icu.impl.BMPSet;
import com.ibm.icu.impl.CharacterPropertiesImpl;
Expand Down Expand Up @@ -278,14 +283,25 @@
* </tr>
* </table>
* </blockquote>
* <p>To iterate over contents of UnicodeSet, the following are available:
* <ul><li>{@link #ranges()} to iterate through the ranges</li>
* <li>{@link #strings()} to iterate through the strings</li>
* <li>{@link #iterator()} to iterate through the entire contents in a single loop.
* That method is, however, not particularly efficient, since it "boxes" each code point into a String.
*
* <p>To iterate over contents of {@code UnicodeSet}, the following are available:
* <ul>
* <li>to iterate through the ranges: {@link #ranges()}, {@link UnicodeSetIterator#nextRange()},
* {@link #rangeStream()}</li>
* <li>to iterate through the strings: {@link #strings()}, {@link UnicodeSetIterator#next()},
* {@link #stringStream()}</li>
* <li>to iterate through the code points: {@link UnicodeSetIterator#next()},
* {@link #codePoints()}, {@link #codePointStream()}</li>
* <li>to iterate through the entire contents in a single loop: {@link #iterator()},
* {@link UnicodeSetIterator#next()}, {@link #stream()}.<br>
* All of these methods are, however, not particularly efficient,
* since they convert each individual code point to a {@code String}.</li>
* </ul>
* All of the above can be used in <b>for</b> loops.
* The {@link com.ibm.icu.text.UnicodeSetIterator UnicodeSetIterator} can also be used, but not in <b>for</b> loops.
*
* <p>The iterator and stream methods work as expected in current idiomatic Java usage.<br>
* The {@link UnicodeSetIterator} cannot be used in <b>for</b> loops, and it is not very idiomatic Java, because it is old.
* But it might be faster in certain use cases. We recommend that you measure in performance-sensitive code.
*
* <p>To replace, count elements, or delete spans, see {@link com.ibm.icu.text.UnicodeSetSpanner UnicodeSetSpanner}.
*
* @author Alan Liu
Expand Down Expand Up @@ -5127,5 +5143,179 @@ public static void setDefaultXSymbolTable(XSymbolTable xSymbolTable) {
CharacterPropertiesImpl.clear();
XSYMBOL_TABLE = xSymbolTable;
}

/**
 * Exposes the code point ranges of this {@code UnicodeSet} as a sequential {@link Stream}
 * of {@link EntryRange} values.
 *
 * <p><b>Warning:</b> The very same {@link EntryRange} instance is handed out for every element;
 * only its contents are reset from one element to the next.
 *
 * <p><b>Warning:</b> Ranges cover code points only. To visit the full contents of the set,
 * the strings have to be iterated as well.
 *
 * <p><b>Warning:</b> For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 * Do not alter the {@code UnicodeSet} while iterating.
 *
 * @return a sequential {@link Stream} of {@link EntryRange}
 */
public Stream<EntryRange> rangeStream() {
    final Spliterator<EntryRange> rangeSource = ranges().spliterator();
    return StreamSupport.stream(rangeSource, /* parallel= */ false);
}

/**
 * Exposes the strings of this {@code UnicodeSet} as a sequential {@link Stream}
 * of {@code String} values.
 *
 * <p><b>Warning:</b> Only the strings are visited. To iterate over the full contents,
 * you have to also iterate over the code points (or the ranges).
 *
 * <p><b>Warning:</b> For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 * Do not alter the {@code UnicodeSet} while iterating.
 *
 * @return a sequential {@link Stream} of {@code String}
 */
public Stream<String> stringStream() {
    final Spliterator<String> stringSource = strings().spliterator();
    return StreamSupport.stream(stringSource, /* parallel= */ false);
}

/**
 * Exposes the code points of this {@code UnicodeSet} as a sequential {@link IntStream},
 * without boxing the individual values.
 *
 * <p><b>Warning:</b> Only code points are visited. To iterate over the full contents,
 * you have to also iterate over the strings.
 *
 * <p><b>Warning:</b> For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 * Do not alter the {@code UnicodeSet} while iterating.
 *
 * @return a sequential {@link IntStream} of Unicode code point values
 */
public IntStream codePointStream() {
    final Spliterator.OfInt codePointSource = new CodePointsSpliterator(this);
    return StreamSupport.intStream(codePointSource, /* parallel= */ false);
}

/**
 * Exposes the elements of this {@code UnicodeSet} as a sequential {@link Stream}
 * of {@code String} values, built on top of this set's own {@code spliterator()}.
 *
 * <p><b>Warning:</b> This is convenient but not particularly efficient, because every
 * individual code point is converted to a {@code String}.
 *
 * <p><b>Warning:</b> For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 * Do not alter the {@code UnicodeSet} while iterating.
 *
 * @return a sequential {@link Stream} of {@code String}
 */
public Stream<String> stream() {
    final Spliterator<String> elementSource = spliterator();
    return StreamSupport.stream(elementSource, /* parallel= */ false);
}

/**
 * Supports {@code for}-each iteration over the code points of this {@code UnicodeSet}.
 *
 * <p>The returned {@link Iterable} walks every code point, "expanding" each range into
 * all the code points from {@link EntryRange#codepoint} to {@link EntryRange#codepointEnd}.
 *
 * <p><b>Warning:</b> This method is a convenience and carries a performance cost,
 * since every {@code int} is boxed into an {@code Integer}.<br>
 * When speed matters, prefer {@link #codePointStream()} or {@link #charAt(int)}.
 *
 * <p><b>Warning:</b> Only code points are visited. To iterate over the full contents,
 * you have to also iterate over the strings.
 *
 * <p><b>Warning:</b> For speed, {@code UnicodeSet} iteration does not check for concurrent modification.<br>
 * Do not alter the {@code UnicodeSet} while iterating.
 *
 * @return an {@link Iterable} over all the code points
 */
public Iterable<Integer> codePoints() {
    final Iterable<Integer> codePointView = new CodePointsIterable();
    return codePointView;
}

/**
 * {@link Iterable} view over the code points of the enclosing {@code UnicodeSet}.
 * Every call to {@link #iterator()} creates a new {@code CodePointsIterator}.
 */
private class CodePointsIterable implements Iterable<Integer> {
    @Override
    public Iterator<Integer> iterator() {
        final Iterator<Integer> freshIterator = new CodePointsIterator();
        return freshIterator;
    }
}

/**
 * Iterator over the code points of the enclosing {@code UnicodeSet}, visited in
 * {@code charAt(int)} index order and boxed as {@code Integer}.
 *
 * <p>The upcoming code point is looked up once and cached. As a result {@code hasNext()}
 * can be called any number of times without skipping elements, and {@code next()} works
 * with or without a preceding {@code hasNext()} call, as the {@link Iterator} contract expects.
 */
private class CodePointsIterator implements Iterator<Integer> {
    /** Index, in {@code charAt} terms, of the next code point to hand out. */
    private int pos = 0;
    /** Cached value of {@code charAt(pos)}; only meaningful while {@code fetched} is true. */
    private int nextCp = -1;
    /** Whether {@code nextCp} currently holds the look-up result for {@code pos}. */
    private boolean fetched = false;

    /** Looks up the code point at {@code pos} unless it is already cached (avoids a second {@code charAt}). */
    private void fetch() {
        if (!fetched) {
            nextCp = charAt(pos);
            fetched = true;
        }
    }

    /**
     * Reports whether another code point is available; does not advance the iteration.
     * The end is detected by {@code charAt} returning -1.
     */
    @Override
    public boolean hasNext() {
        fetch();
        return nextCp != -1;
    }

    /**
     * Returns the next code point and advances past it.
     *
     * @throws NoSuchElementException if the iteration has no more code points
     */
    @Override
    public Integer next() {
        fetch();
        if (nextCp == -1) {
            throw new NoSuchElementException();
        }
        final int result = nextCp;
        // Consume the element: the following call must look up the next index.
        pos++;
        fetched = false;
        return result;
    }

    /**
     * Removal through this iterator is not supported.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}

/**
 * Non-splitting {@link Spliterator.OfInt} over the code points of a {@code UnicodeSet},
 * visited in {@code charAt(int)} index order. Strings in the set are not visited.
 */
private static class CodePointsSpliterator implements Spliterator.OfInt {
    /*
     * WARNING: although it is tempting to also use `Spliterator.SIZED`, it is not safe,
     * as `UnicodeSet.size()` does not return the number of code points,
     * it returns code points + strings
     *
     * Quote from the `UnicodeSet.size()` doc:
     * > Returns the number of elements in this set (its cardinality)
     * > Note than the elements of a set may include both individual
     * > codepoints and strings.
     *
     * And from the Spliterator.SIZED doc:
     * > Characteristic value signifying that the value returned from
     * > {@code estimateSize()} prior to traversal or splitting represents a
     * > finite size that, in the absence of structural source modification,
     * > represents an exact count of the number of elements that would be
     * > encountered by a complete traversal.
     * Note "exact count of the number of elements that would be encountered"
     */
    private static final int CHARACTERISTICS = Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.NONNULL;

    /** The set being traversed. */
    private final UnicodeSet uset;
    /** Index, in {@code charAt} terms, of the next code point to hand out. */
    private int index;

    CodePointsSpliterator(UnicodeSet uset) {
        this.uset = uset;
        this.index = 0;
    }

    /**
     * Returns an estimate of the number of code points still to be traversed.
     *
     * <p>The estimate is {@code uset.size()} minus the number of code points already handed out.
     * Because {@code size()} also counts strings (see the note on {@code CHARACTERISTICS}),
     * this is an upper bound and not an exact count, which is why {@code SIZED} is not reported.
     */
    @Override
    public long estimateSize() {
        return Math.max(0L, (long) uset.size() - index);
    }

    @Override
    public int characteristics() {
        // TODO: consider calling `isFrozen()` and also return `Spliterator.IMMUTABLE` if true?
        return CHARACTERISTICS;
    }

    /**
     * Never splits; always returns {@code null}.
     */
    @Override
    public Spliterator.OfInt trySplit() {
        /* From the doc:
         * > This method may return null for any reason, including emptiness, inability to split after
         * > traversal has commenced, data structure constraints, and efficiency considerations.
         */
        return null;
    }

    /**
     * Passes the next code point, if any, to {@code action}.
     *
     * @param action the consumer that receives the code point
     * @return {@code true} if a code point was passed on, {@code false} once {@code charAt} reports -1
     * @throws NullPointerException if {@code action} is null
     */
    @Override
    public boolean tryAdvance(IntConsumer action) {
        if (action == null) {
            throw new NullPointerException();
        }
        final int cp = uset.charAt(index);
        if (cp == -1) {
            // Exhausted: leave `index` untouched so repeated calls do not distort estimateSize().
            return false;
        }
        index++;
        action.accept(cp);
        return true;
    }
}
}
//eof
Loading

0 comments on commit 84e315c

Please sign in to comment.