package com.xiaomi.ai.minmt.common;

import androidx.exifinterface.media.ExifInterface;
import com.xiaomi.ai.nlp.lm.util.Constant;
import com.xiaomi.onetrack.api.c;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: classes2.dex */
/**
 * Rule-based word tokenizer that applies a fixed sequence of regular-expression rewrites
 * to a sentence and then splits the result on whitespace.
 *
 * <p>The pipeline, in order: collapse whitespace, strip control characters, isolate every
 * character outside the allowed alphabet, mask runs of dots, separate commas and apostrophes,
 * detach sentence-final periods (unless the word is a known abbreviation), unmask the dot runs,
 * and finally split hyphenated compounds.
 *
 * <p>All state is held in immutable static tables and precompiled patterns; the class has no
 * instance fields.
 */
public class MosesTokenizer {
    /**
     * Words after which a trailing period is kept attached (not split off as its own token).
     *
     * <p>NOTE(review): {@code c.a} and {@code c.b} look like decompiler constant substitutions,
     * presumably for the literals "B" and "H" given their position in the A-Z run - confirm
     * against the original source and replace them with literals.
     */
    private static final Set<String> NONBREAKING_PREFIXES = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(
            "A", c.a, "C", "D", "E", "F", "G", c.b, "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z",
            "Adj", "Adm", "Adv", "Asst", "Bart", "Bldg", "Brig", "Bros", "Capt", "Cmdr", "Col", "Comdr", "Con", "Corp", "Cpl",
            "DR", "Dr", "Drs", "Ens", "Gen", "Gov", "Hon", "Hr", "Hosp", "Insp", "Lt", "MM", "MR", "MRS", "MS", "Maj", "Messrs",
            "Mlle", "Mme", "Mr", "Mrs", "Ms", "Msgr", "Op", "Ord", "Pfc", "Ph", "Prof", "Pvt", "Rep", "Reps", "Res", "Rev", "Rt",
            "Sen", "Sens", "Sfc", "Sgt", "Sr", "St", "Spt", "Srg", "v", "vs", "i.e", "rev", "e.g", "Nos", "Nr",
            "Jan", "Feb", "Mar", "Apr", "Jn", "Jl", "Ag", "Sep", "Oct", "Nov", "Dec",
            "U.S", "U.N", "Gov", "Ph.D", "D.Phil", "p.m", "d.c", "a.m", "L.A", "Inc", "Gen", "VS", "Jr", "Co.", "U.K", "N.Y",
            "N.B.A", "F.B.I", "Sept", "Jan", "Jun", "Jul", "May", "E.U", "O.K", "Ltd", "Lt", "Sgt", "Rev", "Col", "N.J", "Capt",
            "N.C", "N.F.L", "Bros", "B.C", "Maj", "C.I.A", "G.M", "S.C", "N.C.A.A", "N.H.L", "J.J", "A.J", "O.J", "S.E.C",
            "R.I.P", "etc", "al")));

    /** Words whose trailing period is kept only when the following word is numeric. */
    private static final Set<String> NUMERIC_ONLY_PREFIXES =
            Collections.unmodifiableSet(new HashSet<>(Arrays.asList("No", "Art", "pp")));

    /** Character-class fragment (no surrounding brackets) listing the characters treated as letters. */
    private static final String ALPHA = "A-Za-zäöüßÄÖÜẞ\\uAC00-\\uD7AF\\u0900-\\u0963\\u0965-\\u097f\\u0400-\\u04ff\\u0500-\\u052f\\u2de0-\\u2dff\\ua640-\\ua69f";

    /** Any single character that is not a digit, dot, whitespace, letter, quote, backtick, comma or hyphen. */
    private static final Pattern NON_TOKEN_CHAR = Pattern.compile("([^0-9.\\s" + ALPHA + "'`,\\-])");

    private static final Pattern WHITESPACE = Pattern.compile("\\s+");
    /** Characters in the range U+0000 to U+001F. */
    private static final Pattern CONTROL_CHARS = Pattern.compile("[\u0000-\u001f]");
    /** A whole word ending in a period; group 1 is the word without that period. */
    private static final Pattern TRAILING_PERIOD = Pattern.compile("^(\\S+)\\.$");
    private static final Pattern ALPHA_WORD = Pattern.compile("[a-zA-Z.]+");
    private static final Pattern NUMERIC_WORD = Pattern.compile("[0-9,.]+");

    private static final Pattern MULTI_DOT = Pattern.compile("\\.(\\.+)");
    private static final Pattern DOTMULTI_BEFORE_NON_DOT = Pattern.compile("DOTMULTI\\.([^.])");
    private static final Pattern DOTMULTI_BEFORE_DOT = Pattern.compile("DOTMULTI\\.");

    private static final Pattern COMMA_AFTER_NON_DIGIT = Pattern.compile("([^0-9])[,]");
    private static final Pattern COMMA_BEFORE_NON_DIGIT = Pattern.compile("[,]([^0-9])");

    private static final Pattern APOS_NON_LETTER_BOTH_SIDES = Pattern.compile("([^A-Za-z])[']([^A-Za-z])");
    private static final Pattern APOS_BEFORE_LETTER = Pattern.compile("([^A-Za-z0-9])[']([A-Za-z])");
    private static final Pattern APOS_AFTER_LETTER = Pattern.compile("([a-zA-Z])[']([^a-zA-Z])");
    private static final Pattern APOS_BETWEEN_LETTERS = Pattern.compile("([a-zA-Z])[']([a-zA-Z])");
    private static final Pattern APOS_DIGIT_S = Pattern.compile("([0-9])[']([s])");

    /** A hyphen with a digit or letter on both sides; the right-hand character is not consumed. */
    private static final Pattern INNER_HYPHEN = Pattern.compile("([0-9" + ALPHA + "])-(?=[0-9" + ALPHA + "])");

    /** Returns whether the first character of {@code word} is upper case; {@code word} must be non-empty. */
    private static boolean firstLetterIsCapital(String word) {
        return Character.isUpperCase(word.charAt(0));
    }

    /**
     * Detaches a word-final period into its own token ({@code "end."} becomes {@code "end ."})
     * unless the period should stay attached.
     *
     * @param text whitespace-separated words
     * @return the words re-joined with {@code Constant.BLANK} after the periods were detached
     */
    private static String handlesNonbreakingPrefixes(String text) {
        String[] words = WHITESPACE.split(text);
        for (int idx = 0; idx < words.length; idx++) {
            Matcher period = TRAILING_PERIOD.matcher(words[idx]);
            if (!period.find()) {
                continue;
            }
            String prefix = period.group(1);
            String next = idx + 1 < words.length ? words[idx + 1] : null;
            if (!keepsTrailingPeriod(prefix, next)) {
                words[idx] = prefix + " .";
            }
        }
        return Utils.join(Constant.BLANK, words);
    }

    /**
     * Decides whether the period following {@code prefix} stays attached to it.
     *
     * <p>The period is kept when any of these holds:
     * <ul>
     *   <li>{@code prefix} is a general non-breaking prefix (and not a numeric-only one);</li>
     *   <li>{@code prefix} is a numeric-only prefix and the next word is numeric;</li>
     *   <li>the next word consists only of ASCII letters and dots and does not start with an
     *       upper-case character.</li>
     * </ul>
     *
     * @param prefix the word without its trailing period
     * @param next the following word, or {@code null} when {@code prefix} is the last word
     */
    private static boolean keepsTrailingPeriod(String prefix, String next) {
        boolean numericOnly = NUMERIC_ONLY_PREFIXES.contains(prefix);
        if (!numericOnly && NONBREAKING_PREFIXES.contains(prefix)) {
            return true;
        }
        if (numericOnly && next != null && isNumeric(next)) {
            return true;
        }
        return next != null && isAlpha(next) && !firstLetterIsCapital(next);
    }

    /** Returns whether {@code word} is non-empty and made up solely of ASCII letters and dots. */
    private static boolean isAlpha(String word) {
        return ALPHA_WORD.matcher(word).matches();
    }

    /** Returns whether {@code word} is non-empty and made up solely of digits, commas and dots. */
    private static boolean isNumeric(String word) {
        return NUMERIC_WORD.matcher(word).matches();
    }

    /**
     * Masks every run of two or more dots so later rules do not treat them as periods.
     *
     * <p>The first dot of a run becomes {@code " DOTMULTI"}; each remaining dot is then folded
     * into an extra {@code DOT} prefix on that marker, with a space inserted after the marker
     * once the run ends. {@link #tokenize} reverses the masking.
     */
    private static String replaceMultiDots(String text) {
        String masked = MULTI_DOT.matcher(text).replaceAll(" DOTMULTI$1");
        while (masked.contains("DOTMULTI.")) {
            masked = DOTMULTI_BEFORE_NON_DOT.matcher(masked).replaceAll("DOTDOTMULTI $1");
            masked = DOTMULTI_BEFORE_DOT.matcher(masked).replaceAll("DOTDOTMULTI");
        }
        return masked;
    }

    /**
     * Splits a sentence into tokens.
     *
     * @param str the text to tokenize; {@code null} or blank input yields an empty list
     * @param z when {@code true}, a hyphen between two letters/digits is emitted as the token
     *     {@code @-@}; when {@code false}, as a plain {@code -}
     * @return the tokens in order; an immutable empty list for {@code null}/blank input,
     *     otherwise a fixed-size list backed by the split array
     */
    public List<String> tokenize(String str, boolean z) {
        if (str == null || str.trim().isEmpty()) {
            return Collections.emptyList();
        }

        // Normalise whitespace first, then drop the remaining control characters.
        String text = WHITESPACE.matcher(str).replaceAll(Constant.BLANK);
        text = CONTROL_CHARS.matcher(text).replaceAll("").trim();

        // Surround every character outside the allowed alphabet with spaces.
        text = NON_TOKEN_CHAR.matcher(text).replaceAll(" $1 ");

        // Dot runs must be masked before the comma/apostrophe/period rules run.
        text = replaceMultiDots(text);

        // Commas are separated unless they sit between two digits.
        text = COMMA_AFTER_NON_DIGIT.matcher(text).replaceAll("$1 , ");
        text = COMMA_BEFORE_NON_DIGIT.matcher(text).replaceAll(" , $1");

        // Apostrophes: isolated unless between letters (or digit + 's'), where they start the right-hand token.
        text = APOS_NON_LETTER_BOTH_SIDES.matcher(text).replaceAll("$1 ' $2");
        text = APOS_BEFORE_LETTER.matcher(text).replaceAll("$1 ' $2");
        text = APOS_AFTER_LETTER.matcher(text).replaceAll("$1 ' $2");
        text = APOS_BETWEEN_LETTERS.matcher(text).replaceAll("$1 '$2");
        text = APOS_DIGIT_S.matcher(text).replaceAll("$1 '$2");

        text = handlesNonbreakingPrefixes(text);
        text = WHITESPACE.matcher(text).replaceAll(Constant.BLANK).trim();

        // Undo the dot masking: each extra DOT prefix turns back into one dot.
        while (text.contains("DOTDOTMULTI")) {
            text = text.replace("DOTDOTMULTI", "DOTMULTI.");
        }
        text = text.replace("DOTMULTI", ".");

        text = INNER_HYPHEN.matcher(text).replaceAll(z ? "$1 @-@ " : "$1 - ");

        // NOTE(review): presumably re-assembles a placeholder that the non-alphabet rule split
        // into "$ tag" - confirm against TagSolverBuilder.TAG.
        return Arrays.asList(WHITESPACE.split(text.replace("$ tag", TagSolverBuilder.TAG)));
    }
}
