Source code for whatstk.whatsapp.auto_header

"""Detect header from chat."""


import logging
import re
from collections import Counter
from typing import List, Tuple, Optional

import pandas as pd

from whatstk.utils.exceptions import RegexError


separators = {".", ",", "-", "/", ":", "[", "]"}


def extract_header_from_text(text: str, encoding: str = "utf-8") -> Optional[str]:
    """Detect the header format of a chat given its whole text.

    The text is split on newline characters and the resulting lines are handed to the header
    auto-detection routine. Any exception raised while detecting the format is swallowed and reported
    as "no header found".

    Args:
        text (str): Loaded chat as string (whole text).
        encoding (str): Encoding to use for UTF when reading/writing (ex. 'utf-8'). Note that the body
                        of this function does not read this argument.
                        `List of Python standard encodings
                        <https://docs.python.org/3/library/codecs.html#standard-encodings>`_.

    Returns:
        str: Format extracted. None if no header was extracted.

    Example:
        Detect the header of a sample chat (available online, see urls in source code
        :mod:`whatstk.data <whatstk.data>`).

        .. code-block:: python

            >>> from whatstk.whatsapp.auto_header import extract_header_from_text
            >>> from urllib.request import urlopen
            >>> from whatstk.data import whatsapp_urls
            >>> filepath_1 = whatsapp_urls.POKEMON
            >>> with urlopen(filepath_1) as f:
            ...     text = f.read().decode('utf-8')
            >>> extract_header_from_text(text)
            '%d.%m.%y, %H:%M - %name:'
    """
    chat_lines = text.split("\n")
    try:
        detected_format = _extract_header_format_from_lines(chat_lines)
    except Exception:  # noqa
        # Best effort by design: any failure means "no header could be detected".
        logging.info("Format not found.")
        return None
    logging.info("Format found was %s", detected_format)
    return detected_format
def _extract_header_format_from_lines(lines: List[str]) -> str:
    """Extract header format from list of lines.

    Args:
        lines (list): List of str, each element is a line of the loaded chat.

    Returns:
        str: Format of the header.

    Raises:
        ValueError: If no usable header candidate is found in ``lines``.
    """
    elements_list, template_list = _extract_elements_template_from_lines(lines)
    return _extract_header_format_from_components(elements_list, template_list)


def _extract_elements_template_from_lines(lines: List[str]) -> Tuple[List[List[int]], List[str]]:
    """Get elements_list and template_list from lines.

    Lines without a possible header, and lines whose header cannot be parsed (``RegexError``), are
    skipped.

    Args:
        lines (list): List with messages.

    Returns:
        tuple: elements_list (list), template_list (list). Both lists have the same length; the i-th
        entries come from the same line.
    """
    elements_list = []
    template_list = []
    for line in lines:
        header = _extract_possible_header_from_line(line)
        if not header:
            continue
        try:
            elements, template = _extract_header_parts(header)
        except RegexError:
            continue
        elements_list.append(elements)
        template_list.append(template)
    return elements_list, template_list


def _extract_possible_header_from_line(line: str) -> Optional[str]:
    """Given a `line` extract possible header.

    Uses ': ' as separator: everything before its first occurrence is the candidate header.

    Args:
        line (str): Line containing header and message body.

    Returns:
        str: Possible header, always ending with ':'. None if the line has no ': ' separator or the
        text before it is empty.
    """
    line_split = line.split(": ")
    if len(line_split) < 2:
        return None
    header = line_split[0]
    if not header.isprintable():
        # Drop invisible direction marks that may precede the date.
        header = header.replace("\u200e", "").replace("\u202e", "")
    if not header:
        # E.g. a continuation line starting with ": ". Indexing header[-1] would raise IndexError and
        # abort detection for the whole chat.
        return None
    if header[-1] != ":":
        header += ":"
    return header


def _extract_header_parts(header: str) -> Tuple[List[int], str]:
    """Extract all parts from header (i.e. date elements and name).

    Every run of decimal digits becomes an integer element and a ``{}`` placeholder in the template.
    Square brackets are backslash-escaped, the user name is replaced by ``%name`` and AM/PM markers
    are replaced by ``%p``.

    Args:
        header (str): Header.

    Returns:
        tuple: Contains two elements, (i) list with components and (ii) string template which specifies
        the formatting of the components.

    Raises:
        RegexError: If the header does not contain exactly one '- name:' or '] name:' section.
    """
    hformat_elements = []
    template_parts = []
    i = 0
    header_len = len(header)
    while i < header_len:
        char = header[i]
        # isdecimal (not isdigit): characters such as superscript digits are digits but int() rejects them.
        if char.isdecimal():
            j = i + 1
            while j < header_len and header[j].isdecimal():
                j += 1
            hformat_elements.append(int(header[i:j]))
            template_parts.append("{}")
            i = j
        else:
            template_parts.append("\\" + char if char in "[]" else char)
            i += 1
    hformat_template = "".join(template_parts)

    matches = list(re.finditer(r"[-|\]]\s[^:]*:", hformat_template))
    if len(matches) != 1:
        raise RegexError(
            "Username match was not possible. Check that header ({}) is of format "
            "'... - %name:' or '[...] %name:'".format(hformat_template)
        )
    # Replace only the matched name span. A global str.replace would also rewrite other occurrences of
    # the name text (e.g. name "M" inside " PM") and explode on an empty name.
    name_start = matches[0].start() + 2
    name_end = matches[0].end() - 1
    hformat_template = hformat_template[:name_start] + "%name" + hformat_template[name_end:]

    for marker in (" PM", " AM", " A.M.", " P.M.", " am", " pm", " a.m.", " p.m."):
        hformat_template = hformat_template.replace(marker, " %p")
    return hformat_elements, hformat_template


def _extract_header_format_from_components(elements_list: List[List[int]], template_list: List[str]) -> str:
    """Extract header format from list containing elements and list containing templates.

    Candidates whose number (or types) of elements differ from the most common one are discarded as
    outliers. Day, year and month are located among the first three elements; hour and minute are
    taken to be the fourth and fifth, and any further element is taken as seconds.

    Args:
        elements_list (list): List with component list.
        template_list (list): List with template strings.

    Returns:
        str: Header format.

    Raises:
        ValueError: If there are no candidates, or candidates have fewer than five numeric elements.
    """
    if not elements_list:
        raise ValueError("No header candidates were found.")

    # Remove outliers: keep only candidates matching the most common length and type signature.
    lengths = [len(e) for e in elements_list]
    types = ["".join(type(ee).__name__ for ee in e) for e in elements_list]
    len_mode = Counter(lengths).most_common(1)[0][0]
    type_mode = Counter(types).most_common(1)[0][0]
    kept = [
        (e, t)
        for e, t, length, signature in zip(elements_list, template_list, lengths, types)
        if length == len_mode and signature == type_mode
    ]
    if not kept:
        raise ValueError("No consistent header candidates were found.")

    # The template must come from the filtered candidates; the first raw line may be an outlier.
    template = kept[0][1]
    hour_code = "%I" if "%p" in template else "%H"

    dates_df = pd.DataFrame([e for e, _ in kept]).select_dtypes("number")
    n_fields = dates_df.shape[1]
    if n_fields < 5:
        raise ValueError("Header candidates need at least five numeric elements (date, hour, minute).")

    # Day: first of the three date positions whose maximum is in 28..31 (position 0 if none is).
    # Only date positions are inspected so hour/minute columns can never be mistaken for the day.
    date_max = dates_df[[0, 1, 2]].max()
    day_pos = int(((date_max > 27) & (date_max < 32)).idxmax())
    # Year: remaining date position with the largest maximum; month: the one left over.
    remaining = [p for p in (0, 1, 2) if p != day_pos]
    year_pos = int(dates_df[remaining].max().idxmax())
    month_pos = next(p for p in remaining if p != year_pos)
    hour_pos = 3
    minutes_pos = 4

    # Dictionary with positions and date element code
    dates_pos = {day_pos: "%d", year_pos: "%y", month_pos: "%m", hour_pos: hour_code, minutes_pos: "%M"}
    if n_fields > 5:
        dates_pos[5] = "%S"

    codes = [dates_pos[k] for k in sorted(dates_pos)] + ["%name"]
    return template.format(*codes)