Source code for whatstk.whatsapp.auto_header

"""Detect header from chat."""


import logging
import re
import pandas as pd
from whatstk.utils.exceptions import RegexError


# Punctuation characters that can appear between the fields of a chat header.
# NOTE(review): not referenced by any function visible in this module view;
# presumably kept for external users -- confirm before removing.
separators = {'.', ',', '-', '/', ':', '[', ']'}


def extract_header_from_text(text, encoding='utf-8'):
    """Extract header from text.

    The text is split into lines and the header format is inferred from them.
    Any error raised while inferring the format is treated as "no header
    found", so this function returns ``None`` instead of raising.

    Args:
        text (str): Loaded chat as string (whole text).
        encoding (str): Encoding to use for UTF when reading/writing (ex. 'utf-8').
            `List of Python standard encodings
            <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.
            Currently not used by this function; kept for interface
            compatibility.

    Returns:
        str: Format extracted. None if no header was extracted.

    Example:
        Load a chat using two text files. In this example, we use sample chats
        (available online, see urls in source code
        :mod:`whatstk.data <whatstk.data>`).

        .. code-block:: python

            >>> from whatstk.whatsapp.auto_header import extract_header_from_text
            >>> from urllib.request import urlopen
            >>> from whatstk.data import whatsapp_urls
            >>> filepath_1 = whatsapp_urls.POKEMON
            >>> with urlopen(filepath_1) as f:
            ...     text = f.read().decode('utf-8')
            >>> extract_header_from_text(text)
            '%d.%m.%y, %H:%M - %name:'
    """
    # Split lines
    lines = text.split('\n')

    # Get format auto. Detection is best-effort: any ordinary failure means
    # "no format", but KeyboardInterrupt/SystemExit must still propagate, so
    # catch Exception rather than using a bare except.
    try:
        hformat = _extract_header_format_from_lines(lines)
    except Exception:  # noqa
        logging.info("Format not found.")
        return None
    logging.info("Format found was %s", hformat)
    return hformat
def _extract_header_format_from_lines(lines): """Extract header from list of lines. Args: lines (list): List of str, each element is a line of the loaded chat. Returns: str: Format of the header. """ # Obtain header format from list of lines elements_list, template_list = _extract_elements_template_from_lines(lines) return _extract_header_format_from_components(elements_list, template_list) def _extract_elements_template_from_lines(lines): """Get elements_list and template_list from lines. Args: lines (list): List with messages. Returns: tuple: elements_list (list), template_list (list) """ # Obtain header format from list of lines elements_list = [] template_list = [] for line in lines: header = _extract_possible_header_from_line(line) if header: try: elements, template = _extract_header_parts(header) except RegexError: continue elements_list.append(elements) template_list.append(template) return elements_list, template_list def _extract_possible_header_from_line(line): """Given a `line` extract possible header. Uses ':' as separator. Args: line (str): Line containing header and message body. Returns: str: Possible header. """ # Extract possible header from line line_split = line.split(': ') if len(line_split) >= 2: # possible header header = line_split[0] if not header.isprintable(): header = header.replace('\u200e', '').replace('\u202e', '') if header[-1] != ':': header += ':' return header return None def _extract_header_parts(header): """Extract all parts from header (i.e. date elements and name). Args: header (str): Header. Returns: tuple: Contains two elements, (i) list with components and (ii) string template which specifies the formatting of the components. 
""" def get_last_idx_digit(v, i): if i+1 < len(v): if v[i+1].isdigit(): return get_last_idx_digit(v, i+1) return i # def get_last_idx_alpha(v, i): # if i+1 < len(v): # if v[i+1].isalpha(): # return get_last_idx_alpha(v, i+1) # elif i+2 < len(v): # if v[i+1].isspace() and v[i+2].isalpha(): # return get_last_idx_alpha(v, i+2) # return i hformat_elements = [] hformat_template = '' i = 0 while i < len(header): if header[i].isdigit(): j = get_last_idx_digit(header, i) hformat_elements.append(int(header[i:j+1])) hformat_template += '{}' i = j else: if header[i] in ['[', ']']: hformat_template += '\\'+header[i] else: hformat_template += header[i] i += 1 items = re.findall(r'[-|\]]\s[^:]*:', hformat_template) if len(items) != 1: raise RegexError( "Username match was not possible. Check that header (%s) is of format '... - %name:' or '[...] %name:'", hformat_template) hformat_template = hformat_template.replace(items[0][2:-1], '%name') code = ' %p' hformat_template = hformat_template\ .replace(' PM', code)\ .replace(' AM', code)\ .replace(' A.M.', code)\ .replace(' P.M.', code)\ .replace(' am', code)\ .replace(' pm', code)\ .replace(' a.m.', code)\ .replace(' p.m.', code) return hformat_elements, hformat_template def _extract_header_format_from_components(elements_list, template_list): """Extract header format from list containing elements and list containing templates. Args: elements_list (list): List with component list. template_list (list): List with template strings. Returns: str: Header format. 
""" # Remove outliers elements_list_ = [] template_list_ = [] lengths = [len(e) for e in elements_list] types = ["".join([str(type(ee).__name__) for ee in e]) for e in elements_list] len_mode = max(set(lengths), key=lengths.count) type_mode = max(set(types), key=types.count) for e, t in zip(elements_list, template_list): if (len(e) == len_mode) and ("".join([str(type(ee).__name__) for ee in e]) == type_mode): elements_list_.append(e) template_list_.append(t) # Get positions df = pd.DataFrame(elements_list_) dates_df = df.select_dtypes(int) template = template_list[0] if '%p' in template: hour_code = "%I" else: hour_code = "%H" # day day_pos = ((dates_df.max() > 27) & (dates_df.max() < 32)).idxmax() dates_df = dates_df.drop(columns=[day_pos]) # year # year_pos = dates_df.std().idxmin() pos = [0, 1, 2] pos.remove(day_pos) year_pos = dates_df[pos].max().idxmax() # Only consider positions 0,1,2 dates_df = dates_df.drop(columns=[year_pos]) # Month month_pos = dates_df.columns.min() dates_df = dates_df.drop(columns=[month_pos]) # Hour hour_pos = 3 dates_df = dates_df.drop(columns=[hour_pos]) # Minute minutes_pos = 4 dates_df = dates_df.drop(columns=[minutes_pos]) # Dictionary with positions and date element code dates_pos = { day_pos: '%d', year_pos: '%y', month_pos: '%m', hour_pos: hour_code, minutes_pos: '%M' } # Seconds if dates_df.shape[1] > 0: seconds_pos = 5 dates_pos[seconds_pos] = '%S' keys_ordered = sorted(dates_pos.keys()) dates_codes = [dates_pos[k] for k in keys_ordered] codes = dates_codes + ['%name'] # print(codes) # print(template) # print(template) # print(codes) code_template = template.format(*codes) # print(code_template) # print('---------------') # print(code_template) return code_template