Source code for whatstk.analysis.interventions

"""Base analysis tools."""


import pandas as pd
from whatstk.utils.utils import COLNAMES_DF, _get_df


def get_interventions_count(df=None, chat=None, date_mode='date', msg_length=False, cummulative=False,
                            all_users=False):
    """Count the interventions of each user per unit of time.

    The time unit used to group the interventions is selected with ``date_mode``.

    **Note**: Either ``df`` or ``chat`` must be provided.

    Args:
        df (pandas.DataFrame, optional): Chat data. Attribute `df` of a chat loaded using Chat. If a value is
                                            given, ``chat`` is ignored.
        chat (Chat, optional): Chat data. Object obtained when chat loaded using Chat. Required if ``df`` is
                                None.
        date_mode (str, optional): Mode used to group the interventions. Defaults to ``date_mode=date``.
                                    Available modes are:

                                    - ``'date'``: Grouped by particular date (year, month and day).
                                    - ``'hour'``: Grouped by day hours (24 hours).
                                    - ``'month'``: Grouped by months (12 months).
                                    - ``'weekday'``: Grouped by weekday (i.e. monday, tuesday, ..., sunday).
                                    - ``'hourweekday'``: Grouped by weekday and hour.
        msg_length (bool, optional): Set to True to count the number of characters instead of the number of
                                        messages sent.
        cummulative (bool, optional): Set to True to obtain cumulative counts.
        all_users (bool, optional): Obtain the number of interventions of all users combined. Defaults to
                                    False.

    Returns:
        pandas.DataFrame: DataFrame with shape *NxU*, where *N*: number of time-slots and *U*: number of users.

    Raises:
        ValueError: if ``date_mode`` value is not supported.

    Example:
        Get number of interventions per user from `POKEMON chat
        <http://raw.githubusercontent.com/lucasrodes/whatstk/develop/chats/whatsapp/pokemon.txt>`_. The counts
        are represented as a `NxU` matrix, where `N`: number of time-slots and `U`: number of users.

        ..  code-block:: python

            >>> from whatstk import WhatsAppChat
            >>> from whatstk.analysis import get_interventions_count
            >>> from whatstk.data import whatsapp_urls
            >>> filepath = whatsapp_urls.POKEMON
            >>> chat = WhatsAppChat.from_source(filepath)
            >>> counts = get_interventions_count(chat=chat, date_mode='date', msg_length=False)
            >>> counts.head(5)
            username    Ash Ketchum  Brock  Jessie & James  ...  Prof. Oak  Raichu  Wobbuffet
            date                                            ...
            2016-08-06            2      2               0  ...          0       0          0
            2016-08-07            1      1               0  ...          1       0          0
            2016-08-10            1      0               1  ...          0       2          0
            2016-08-11            0      0               0  ...          0       0          0
            2016-09-11            0      0               0  ...          0       0          0

            [5 rows x 8 columns]

    """
    df = _get_df(df=df, chat=chat)

    # Pick the component(s) of the datetime index that define the time slots.
    if date_mode == 'date':
        time_keys = [df.index.date]
    elif date_mode == 'hour':
        time_keys = [df.index.hour]
    elif date_mode == 'weekday':
        time_keys = [df.index.weekday]
    elif date_mode == 'hourweekday':
        time_keys = [df.index.weekday, df.index.hour]
    elif date_mode == 'month':
        time_keys = [df.index.month]
    else:
        raise ValueError("Mode {} is not implemented. Valid modes are 'date', 'hour', 'weekday', "
                         "'hourweekday' and 'month'.".format(date_mode))

    counts = _interventions(df, time_keys, msg_length)

    if date_mode == 'date':
        # Grouping was done on plain date objects; restore a datetime index.
        counts.index = pd.to_datetime(counts.index)

    # Label the index level(s) after the selected grouping.
    if date_mode == 'hourweekday':
        counts.index = counts.index.set_names(['weekday', 'hour'])
    else:
        counts.index.name = date_mode

    # Flatten the column MultiIndex produced by unstacking: keep the usernames only.
    counts.columns = counts.columns.get_level_values(COLNAMES_DF.USERNAME)

    if all_users:
        combined = counts.sum(axis=1)
        counts = pd.DataFrame(combined, columns=['interventions count'])

    return counts.cumsum() if cummulative else counts


def _interventions(df, index_date, msg_length):
    """Aggregate interventions per time slot and user.

    Args:
        df (pandas.DataFrame): Chat as DataFrame.
        index_date (list): Grouping keys that define the time slots. The username column is appended to
                            them to form the full group-by key.
        msg_length (bool): If True, sum the length of the messages of each group. Otherwise, count the
                            entries of each group.

    Returns:
        pandas.DataFrame: Table with one row per time slot, where the innermost group level (the username)
                            has been unstacked into the columns and missing combinations are filled with 0.

    """
    group_keys = index_date + [COLNAMES_DF.USERNAME]

    if not msg_length:
        # Plain message count per (time slot, user).
        return df.groupby(by=group_keys).agg('count').unstack(fill_value=0)

    # Work on a copy so the caller's DataFrame does not gain the helper column.
    with_lengths = df.copy()
    with_lengths[COLNAMES_DF.MESSAGE_LENGTH] = with_lengths[COLNAMES_DF.MESSAGE].apply(len)
    totals = with_lengths.groupby(by=group_keys).agg({
        COLNAMES_DF.MESSAGE_LENGTH: lambda lengths: lengths.sum()
    })
    return totals.unstack(fill_value=0)