Source code for pycognaize.common.numeric_parser

"""Defines NumericParser for parsing a number string into a float values"""
import re
from typing import Union
from math import copysign



[docs]
class NumericParser:
    """
    Parser for any kind of text input into a corresponding format.
    Numeric data is assumed to be valid and clean.
    Use NumericColumnFormatter for validating your data.
    """
    numeric = (r"(^0+$|^[\\[\\(+-]*(([1-9][0-9]{0,2}([.,][0-9]{3})*)|[0-9]"
               r"|[1-9][0-9]+)([,.][0-9]*)?[\]\)]*$)")
    delimiters = r"[.,]"
    REGEX_NO_ALPHANUM_CHARS = re.compile(r'[^a-zA-Z0-9)\[\](-.,]')

    brackets = r"[\[\]\(\)]"
    # match a pattern with whitespace followed by either a single comma
    #   from the left or whitespace surrounded by numbers
    single_white_space_regex = "(?<=\\d)(,{0,1}) {1,2}(?=\\d)"

    def __init__(self, raw: str):
        """Takes string as an input and create ValueParser object
        :param raw: input raw string representation of the number

        """

        raw = re.sub(self.single_white_space_regex, '', raw)
        raw = raw.replace('$', '').strip()
        self.raw = raw
        self.format = None
        self.parsed = None
        self.value_type = None
        self._separators = None
        self._separated_digits = None
        self.sign = None
        self.removed_sign = False

    def __get_separated_digits_and_separators(self):
        """Assigns the list of separators and separated digits to self.
            Ex. self.raw=12,345.6
            [',','.']->self.separators
            ['12', '345', '6']->self._separated_digits
            Applicable only if type is numeric
        """
        # TODO: check if strip_value is used properly
        val = self.REGEX_NO_ALPHANUM_CHARS.sub('', self.raw)
        val = re.sub(self.brackets, '', val)
        val = self.strip_value(val)
        # strip from multiple zeros. ex: 000 -> 0
        val = val[:-1].lstrip('0') + val[-1]
        self._separators = re.findall(self.delimiters, val)
        self._separated_digits = re.split(self.delimiters, val)


[docs]
    def infer_sign(self):
        """
        Infer the sign of the number either with brackets
        or a trailing minus sign"""
        self.sign = 1
        if self.raw.startswith('(') and self.raw.endswith(')'):
            self.sign = -1
            self.remove_sign_parentheses()

        elif self.raw.lstrip().startswith('-'):
            self.sign = -1
            self.remove_dash_sign()


    def remove_dash_sign(self):
        self.raw = self.raw.lstrip().lstrip('-')
        self.removed_sign = True

    def remove_sign_parentheses(self):
        self.raw = self.raw.lstrip('(')
        self.raw = self.raw.rstrip(')')
        self.removed_sign = True


[docs]
    def parse_regular_float(self):
        """Parse Like a regular float number"""
        # noinspection PyBroadException
        try:
            # try parsing like a regular float number
            # if it doesn't work use more complicated logic
            delimiter = any(re.findall(self.delimiters, self.raw))
            decimal_part = len(re.split(self.delimiters, self.raw)[-1]) if\
                delimiter else 0
            if decimal_part < 3:
                self.parsed = float(self.raw.strip()) * self.sign
            elif decimal_part > 2:
                return None
            return self.parsed
        except Exception:
            return None



[docs]
    def parse_regular_float_with_semicolon(self):
        """Parse Like a regular float number with semicolon"""
        # noinspection PyBroadException
        try:
            if self.raw.strip().startswith('0'):
                # try parsing like a regular float number after replacing
                # commas if it doesn't work use more complicated logic
                if self.removed_sign:
                    self.parsed = float(self.raw.strip().replace(',', '.')) \
                                  * self.sign
                else:
                    self.parsed = float(self.raw.strip().replace(',', '.'))

                return self.parsed
        except Exception:
            return None



[docs]
    def parse_numeric(self) -> Union[int, float]:
        """
        Parse the raw data, and return parsed numeric value
        and get the numeric, separator and value type of the raw data

        :return: Parsed float or int value
        """
        self.raw = self.raw.replace('–', '-')
        self.infer_sign()

        parse_float = self.parse_regular_float()
        if parse_float is not None:
            return parse_float
        else:
            parse_float_semicolon = self.parse_regular_float_with_semicolon()
            if parse_float_semicolon is not None:
                return parse_float_semicolon

        if '%' in self.raw:
            return float('nan')

        try:
            self.__get_separated_digits_and_separators()

            self.parsed = self.parse_raw_numeric()
            self.parsed = self.parsed * self.sign
        except Exception:
            return float('nan')
        return self.parsed



[docs]
    def parse_raw_numeric(self) -> Union[int, float]:
        """Used for parsing raw text without getting any context
            from the column
        """
        if len(self._separators) == 0:
            return int(self._separated_digits[0])
        elif len(self._separated_digits[-1]) < 3:
            int_part = int(''.join(self._separated_digits[:-1]))
            dec_part = float('0.' + self._separated_digits[-1])
            dec_part = copysign(dec_part, int_part)
            return int_part + dec_part
        elif len(self._separators) > 1 and self._separators[0] != \
                self._separators[-1] and len(self._separated_digits[-1]) < 3:
            int_part = int(''.join(self._separated_digits[:-1]))
            dec_part = float('0.' + self._separated_digits[-1])
            dec_part = copysign(dec_part, int_part)
            return int_part + dec_part
        else:
            return int(''.join(self._separated_digits))



[docs]
    @staticmethod
    def strip_value(text: str):
        """Strip non-numeric characters from a string representing a number
        :param str text: Input string representing a number
        :return: String with stripped non-numeric characters
        :rtype str:
        """
        currencies = u'$¢£¤¥֏؋৲৳৻૱௹฿៛₠₡₢₣₤₥₦₧₨₩₪₫€₭₮₯₰₱₲₳₴₵₶₷₸₹₺₻₼₽₾꠸﷼﹩＄￠￡￥￦'
        text = text.strip(
            '\n\r\t ' + currencies + "'+*/%^_:;'\"\\|").rstrip('—-.,')
        text = text.replace(' ', '')
        return text



[docs]
    def is_numeric(self) -> bool:
        """Determines if the text is a representation of a numeric value

        :return: True if the raw text corresponds to numeric representation
        """
        if re.match(self.numeric,
                    self.strip_value(self.raw)) and '%' not in self.raw:
            return True
        return False