Source code for whatstk.analysis.interventions

"""Base analysis tools."""

from typing import TYPE_CHECKING, List

import pandas as pd
from whatstk.utils.utils import COLNAMES_DF, _get_df

if TYPE_CHECKING:  # pragma: no cover
    from whatstk._chat import BaseChat  # pragma: no cover


def get_interventions_count(
    df: pd.DataFrame = None,
    chat: "BaseChat" = None,
    date_mode: str = "date",
    msg_length: bool = False,
    cumulative: bool = False,
    all_users: bool = False,
) -> pd.DataFrame:
    """Count the interventions of each user per unit of time.

    The unit of time is selected with ``date_mode``.

    **Note**: Either ``df`` or ``chat`` must be provided.

    Args:
        df (pandas.DataFrame, optional): Chat data. Attribute `df` of a chat loaded using Chat. If a value is given,
                                            ``chat`` is ignored.
        chat (Chat, optional): Chat data. Object obtained when chat loaded using Chat. Required if ``df`` is None.
        date_mode (str, optional): Mode used to group the interventions. Defaults to ``date_mode=date``.
                                    Available modes are:

                                    - ``'date'``: Grouped by particular date (year, month and day).
                                    - ``'hour'``: Grouped by day hours (24 hours).
                                    - ``'month'``: Grouped by months (12 months).
                                    - ``'weekday'``: Grouped by weekday (i.e. monday, tuesday, ..., sunday).
                                    - ``'hourweekday'``: Grouped by weekday and hour.
        msg_length (bool, optional): Set to True to count the number of characters instead of the number of
                                        messages sent.
        cumulative (bool, optional): Set to True to obtain cumulative counts.
        all_users (bool, optional): Set to True to sum the interventions of all users into a single
                                    ``'interventions count'`` column. Defaults to False.

    Returns:
        pandas.DataFrame: DataFrame with shape *NxU*, where *N*: number of time-slots and *U*: number of users.

    Raises:
        ValueError: if ``date_mode`` value is not supported.

    Example:
        Get number of interventions per user from `POKEMON chat
        <http://raw.githubusercontent.com/lucasrodes/whatstk/develop/chats/whatsapp/pokemon.txt>`_. The counts
        are represented as a `NxU` matrix, where `N`: number of time-slots and `U`: number of users.

        ..  code-block:: python

            >>> from whatstk import WhatsAppChat
            >>> from whatstk.analysis import get_interventions_count
            >>> from whatstk.data import whatsapp_urls
            >>> filepath = whatsapp_urls.POKEMON
            >>> chat = WhatsAppChat.from_source(filepath)
            >>> counts = get_interventions_count(chat=chat, date_mode='date', msg_length=False)
            >>> counts.head(5)
            username    Ash Ketchum  Brock  Jessie & James  ...  Prof. Oak  Raichu  Wobbuffet
            date                                            ...
            2016-08-06            2      2               0  ...          0       0          0
            2016-08-07            1      1               0  ...          1       0          0
            2016-08-10            1      0               1  ...          0       2          0
            2016-08-11            0      0               0  ...          0       0          0
            2016-09-11            0      0               0  ...          0       0          0

            [5 rows x 8 columns]

    """
    df = _get_df(df=df, chat=chat)

    # Datetime accessor attribute(s) that act as grouping keys for each supported mode.
    accessors_by_mode = {
        "date": ("date",),
        "hour": ("hour",),
        "weekday": ("weekday",),
        "hourweekday": ("weekday", "hour"),
        "month": ("month",),
    }

    # Guard clause: reject unsupported modes before any grouping is attempted.
    if date_mode not in tuple(accessors_by_mode):
        raise ValueError(
            "Mode {} is not implemented. Valid modes are 'date', 'hour', 'weekday', "
            "'hourweekday' and 'month'.".format(date_mode)
        )

    group_keys = [getattr(df[COLNAMES_DF.DATE].dt, accessor) for accessor in accessors_by_mode[date_mode]]
    counts = _interventions(df, group_keys, msg_length)

    if date_mode == "date":
        # Grouping by ``.dt.date`` yields plain date objects; convert the index back to datetimes.
        counts.index = pd.to_datetime(counts.index)

    if date_mode == "hourweekday":
        counts.index = counts.index.set_names(["weekday", "hour"])
    else:
        counts.index.name = date_mode

    # Keep only the username level of the column index.
    counts.columns = counts.columns.get_level_values(COLNAMES_DF.USERNAME)

    if all_users:
        counts = pd.DataFrame(counts.sum(axis=1), columns=["interventions count"])

    return counts.cumsum() if cumulative else counts


def _interventions(df: pd.DataFrame, series_tf: List[pd.Series], msg_length: bool) -> pd.DataFrame:
    """Get number of interventions per time slot per user.

    Args:
        df (pandas.DataFrame): Chat as DataFrame.
        series_tf (list): List of pandas series with the date transformations applied, so we can group by,
                            e.g., month. They are used as grouping keys, followed by the username column.
        msg_length (bool): Set to True to sum the number of characters of the messages instead of counting
                            the number of messages sent.

    Returns:
        pandas.DataFrame: Table with one row per time slot (as defined by ``series_tf``). The usernames are
                            unstacked into the innermost column level, and (time slot, user) combinations
                            without any intervention are filled with 0.

    """
    group_keys = series_tf + [COLNAMES_DF.USERNAME]

    if msg_length:
        # Work on a copy so the caller's DataFrame does not gain the helper length column.
        with_lengths = df.copy()
        with_lengths[COLNAMES_DF.MESSAGE_LENGTH] = with_lengths[COLNAMES_DF.MESSAGE].apply(len)
        counts = with_lengths.groupby(by=group_keys).agg({COLNAMES_DF.MESSAGE_LENGTH: "sum"})
    else:
        # Use the shared column-name constant rather than a hard-coded "message" literal.
        counts = df.groupby(by=group_keys).agg({COLNAMES_DF.MESSAGE: "count"})

    # The username is the last grouping key, hence the level moved to the columns by default.
    return counts.unstack(fill_value=0)