ICU-22979 Support inverse rule for [] span in RBNF

unicode-org · Jan 7, 2025 · a8e7728 · a8e7728
1 parent a8d9f47
commit a8e7728
Show file tree

Hide file tree

Showing 11 changed files with 706 additions and 483 deletions.
diff --git a/icu4c/source/i18n/nfrs.cpp b/icu4c/source/i18n/nfrs.cpp
@@ -152,7 +152,7 @@ NFRuleSet::NFRuleSet(RuleBasedNumberFormat *_owner, UnicodeString* descriptions,
 
     UnicodeString& description = descriptions[index]; // !!! make sure index is valid
 
-    if (description.length() == 0) {
+    if (description.isEmpty()) {
         // throw new IllegalArgumentException("Empty rule set description");
         status = U_PARSE_ERROR;
         return;
@@ -177,16 +177,16 @@ NFRuleSet::NFRuleSet(RuleBasedNumberFormat *_owner, UnicodeString* descriptions,
         name.setTo(UNICODE_STRING_SIMPLE("%default"));
     }
 
-    if (description.length() == 0) {
+    if (description.isEmpty()) {
         // throw new IllegalArgumentException("Empty rule set description");
         status = U_PARSE_ERROR;
     }
 
     fIsPublic = name.indexOf(gPercentPercent, 2, 0) != 0;
 
-    if ( name.endsWith(gNoparse,8) ) {
+    if (name.endsWith(gNoparse, 8)) {
         fIsParseable = false;
-        name.truncate(name.length()-8); // remove the @noparse from the name
+        name.truncate(name.length() - 8); // remove the @noparse from the name
     }
 
     // all of the other members of NFRuleSet are initialized

diff --git a/icu4c/source/i18n/nfrule.cpp b/icu4c/source/i18n/nfrule.cpp
@@ -64,6 +64,7 @@ NFRule::~NFRule()
 
 static const char16_t gLeftBracket = 0x005b;
 static const char16_t gRightBracket = 0x005d;
+static const char16_t gVerticalLine = 0x007C;
 static const char16_t gColon = 0x003a;
 static const char16_t gZero = 0x0030;
 static const char16_t gNine = 0x0039;
@@ -146,6 +147,7 @@ NFRule::makeRules(UnicodeString& description,
         // then it's really shorthand for two rules (with one exception)
         LocalPointer<NFRule> rule2;
         UnicodeString sbuf;
+        int32_t orElseOp = description.indexOf(gVerticalLine);
 
         // we'll actually only split the rule into two rules if its
         // base value is an even multiple of its divisor (or it's one
@@ -193,9 +195,13 @@ NFRule::makeRules(UnicodeString& description,
             rule2->radix = rule1->radix;
             rule2->exponent = rule1->exponent;
 
-            // rule2's rule text omits the stuff in brackets: initialize
-            // its rule text and substitutions accordingly
+            // By default, rule2's rule text omits the stuff in brackets,
+            // unless it contains a | between the brackets.
+            // Initialize its rule text and substitutions accordingly.
             sbuf.append(description, 0, brack1);
+            if (orElseOp >= 0) {
+                sbuf.append(description, orElseOp + 1, brack2 - orElseOp - 1);
+            }
             if (brack2 + 1 < description.length()) {
                 sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
             }
@@ -206,7 +212,12 @@ NFRule::makeRules(UnicodeString& description,
         // the brackets themselves: initialize _its_ rule text and
         // substitutions accordingly
         sbuf.setTo(description, 0, brack1);
-        sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
+        if (orElseOp >= 0) {
+            sbuf.append(description, brack1 + 1, orElseOp - brack1 - 1);
+        }
+        else {
+            sbuf.append(description, brack1 + 1, brack2 - brack1 - 1);
+        }
         if (brack2 + 1 < description.length()) {
             sbuf.append(description, brack2 + 1, description.length() - brack2 - 1);
         }
@@ -404,7 +415,7 @@ NFRule::parseRuleDescriptor(UnicodeString& description, UErrorCode& status)
     // finally, if the rule body begins with an apostrophe, strip it off
     // (this is generally used to put whitespace at the beginning of
     // a rule's rule text)
-    if (description.length() > 0 && description.charAt(0) == gTick) {
+    if (!description.isEmpty() && description.charAt(0) == gTick) {
         description.removeBetween(0, 1);
     }
 

diff --git a/icu4c/source/i18n/rbnf.cpp b/icu4c/source/i18n/rbnf.cpp
@@ -1568,12 +1568,12 @@ RuleBasedNumberFormat::init(const UnicodeString& rules, LocalizationInfo* locali
 
     // divide up the descriptions into individual rule-set descriptions
     // and store them in a temporary array.  At each step, we also
-    // new up a rule set, but all this does is initialize its name
+    // create a rule set, but all this does is initialize its name
     // and remove it from its description.  We can't actually parse
     // the rest of the descriptions and finish initializing everything
     // because we have to know the names and locations of all the rule
     // sets before we can actually set everything up
-    if(!numRuleSets) {
+    if (!numRuleSets) {
         status = U_ILLEGAL_ARGUMENT_ERROR;
         return;
     }
@@ -1616,9 +1616,9 @@ RuleBasedNumberFormat::init(const UnicodeString& rules, LocalizationInfo* locali
     // last public rule set, no matter what the localization data says.
     initDefaultRuleSet();
 
-    // finally, we can go back through the temporary descriptions
-    // list and finish setting up the substructure (and we throw
-    // away the temporary descriptions as we go)
+    // Now that we know all the rule names, we can go back through
+    // the temporary descriptions list and finish setting up the substructure
+    // (and we throw away the temporary descriptions as we go)
     {
         for (int i = 0; i < numRuleSets; i++) {
             fRuleSets[i]->parseRules(ruleSetDescriptions[i], status);
@@ -1706,10 +1706,13 @@ RuleBasedNumberFormat::stripWhitespace(UnicodeString& description)
     UnicodeString result;
 
     int start = 0;
-    while (start != -1 && start < description.length()) {
-        // seek to the first non-whitespace character...
+    UChar ch;
+    while (start < description.length()) {
+        // Seek to the first non-whitespace character...
+        // If the first non-whitespace character is a semicolon, skip it and continue
         while (start < description.length()
-            && PatternProps::isWhiteSpace(description.charAt(start))) {
+            && (PatternProps::isWhiteSpace(ch = description.charAt(start)) || ch == gSemiColon))
+        {
             ++start;
         }
 
@@ -1720,20 +1723,16 @@ RuleBasedNumberFormat::stripWhitespace(UnicodeString& description)
             // or if we don't find a semicolon, just copy the rest of
             // the string into the result
             result.append(description, start, description.length() - start);
-            start = -1;
+            break;
         }
         else if (p < description.length()) {
             result.append(description, start, p + 1 - start);
             start = p + 1;
         }
-
-        // when we get here, we've seeked off the end of the string, and
+        // when we get here from the else, we've sought off the end of the string, and
         // we terminate the loop (we continue until *start* is -1 rather
         // than until *p* is -1, because otherwise we'd miss the last
         // rule in the description)
-        else {
-            start = -1;
-        }
     }
 
     description.setTo(result);