Source code for stats

'''
Python module for handling data statistics

'''

from __future__ import annotations

from warnings import filterwarnings
filterwarnings('ignore')
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from itertools import cycle
from random import choice
from matplotlib import pyplot as plt


class Correlation:
    r"""
    Pairwise correlation matrices using Pearson's or Chatterjee's coefficient.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data whose columns are correlated against each other. The frame is
        stored by reference and is never modified by this class.
    """

    # Accepted spellings of the ``method`` argument of :meth:`corr`.
    _NONLINEAR_METHODS = ('chatterjee', 'nonlinear')
    _LINEAR_METHODS = ('pearsonr', 'linear')

    def __init__(self, dataframe: pd.DataFrame):
        self._df = dataframe
        # Default method so plot_heatmap() also works before corr() is called.
        self._method = 'chatterjee'

    def _chatterjee(self, x: pd.Series, y: pd.Series) -> float:
        '''
        A private method that implements Chatterjee's rank correlation.

        Computes ``1 - 3 * sum(|r[i+1] - r[i]|) / (n**2 - 1)`` where ``r`` are
        the ranks of `y` ordered by the ranks of `x` and ``n`` is the number
        of rows. The coefficient is not symmetric in `x` and `y`.

        Parameters
        ----------
        x : pd.Series
            Variable used to order the observations.
        y : pd.Series
            Variable whose rank differences are accumulated.

        Returns
        -------
        float
            Correlation between the two variables.
        '''
        ranks = pd.DataFrame()
        ranks['x_rk'] = x.rank()
        ranks['y_rk'] = y.rank()
        ranks = ranks.sort_values('x_rk')

        # Sum of absolute differences between consecutive y-ranks.
        rank_jumps = ranks['y_rk'].diff().abs().sum()
        n_rows = ranks.shape[0]
        return 1 - 3 * rank_jumps / (pow(n_rows, 2) - 1)

    def corr(self, method: str = 'chatterjee'):
        r'''
        Function to calculate the linear (Pearson's) and non-linear (Chatterjee's)
        relationships between log curves. Relationship between well logs are
        usually non-linear.

        Parameters
        ----------
        method : str, default 'chatterjee'
            Method of correlation. {'chatterjee', 'pearsonr', 'linear', 'nonlinear'}

            * 'linear' is the same as 'pearsonr'
            * 'nonlinear' is the same as 'chatterjee'

        Returns
        -------
        pd.DataFrame
            Correlation matrix (float32) of all possible log curves combination.
            The column label is the first (``x``) variable and the row label is
            the second (``y``) variable, i.e. ``result[x][y]``.

        Raises
        ------
        ValueError
            If `method` is not one of the supported names.

        Example
        -------
        >>> corr = Correlation(df)
        >>> v = corr.corr(method='chatterjee')
        '''
        if method in self._NONLINEAR_METHODS:
            data = self._df
            pair_corr = self._chatterjee
        elif method in self._LINEAR_METHODS:
            # Pearson cannot handle missing values. Drop incomplete rows on a
            # local copy only, so the stored dataframe is left untouched.
            data = self._df.dropna()

            def pair_corr(x, y):
                coefficient, _ = pearsonr(x, y)
                return coefficient
        else:
            raise ValueError(
                f"Unknown correlation method {method!r}; expected one of "
                "'chatterjee', 'nonlinear', 'pearsonr' or 'linear'"
            )

        # Remember the method only once it is known to be valid.
        self._method = method

        columns = self._df.columns.tolist()
        # Fill a plain float array instead of chained `df[i][j] = ...`
        # assignment, which does not write through under pandas Copy-on-Write.
        values = np.empty((len(columns), len(columns)), dtype=np.float32)
        for col_pos, x_name in enumerate(columns):
            for row_pos, y_name in enumerate(columns):
                values[row_pos, col_pos] = pair_corr(data[x_name], data[y_name])

        return pd.DataFrame(values, index=columns, columns=columns)

    def plot_heatmap(self, title: str = 'Correlation Heatmap',
                     figsize: tuple[float, float] = (12, 7),
                     annot: bool = True, cmap=None):
        r'''
        Plots the heat map of Correlation Matrix

        The matrix is recomputed with the method last passed to :meth:`corr`
        ('chatterjee' if :meth:`corr` has not been called yet).

        Parameters
        ----------
        title : str
            Title of plot

        figsize : tuple
            Size of plot as (width, height)

        annot : bool, default True
            To annotate the coefficient in the plot

        cmap : matplotlib colormap name or object, or list of colors, optional
            The mapping from data values to color space

        Example
        -------
        >>> corr = Correlation(df)
        >>> v = corr.corr(method='chatterjee')
        >>> corr.plot_heatmap(cmap='Reds')
        '''
        matrix = self.corr(self._method)

        # NOTE: this sets the process-wide default figure size, so it also
        # affects figures created later by other code.
        plt.rcParams['figure.figsize'] = figsize
        plt.title(title)
        sns.heatmap(matrix, annot=annot, vmin=-1, vmax=1, cmap=cmap)
def displayFreq(df: pd.DataFrame, *cols: str, bins: int = 12,
                figsize: tuple[float, float] = (8, 8)):
    '''
    Function to plot the frequency distribution of well log curves

    Histograms are laid out on a grid with two rows, one subplot per curve,
    and colored by cycling through the matplotlib color codes ``'bgrcmk'``.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe of data

    cols : str
        Names of the log curves to show the distribution of

    bins : int
        Number of bins to group the data

    figsize : tuple
        Size of plot as (width, height)

    Returns
    -------
    None
        Shows a plot of the frequency distribution of well log curves

    Raises
    ------
    ValueError
        If no column name is given.

    Example
    -------
    >>> from petrolib.stats import displayFreq
    >>> displayFreq(df, 'GR','CALI', 'COAL', 'DT', 'DT_LOG', bins=15, figsize=(20,10))
    '''
    if not cols:
        raise ValueError('displayFreq requires at least one column name')

    # Two rows; number of grid columns is ceil(len(cols) / 2).
    grid_cols = len(cols) - (len(cols) // 2)

    # Create a bare figure: calling plt.subplots() here would add a 1 x N row
    # of empty axes underneath the 2-row grid drawn below.
    plt.figure(figsize=figsize)
    plt.suptitle('Frequency Distribution', fontsize=20)

    for position, (col, color) in enumerate(zip(cols, cycle('bgrcmk')), start=1):
        ax = plt.subplot(2, grid_cols, position)
        df[col].plot.hist(ax=ax, bins=bins, color=color, alpha=0.5)
        ax.grid(which='major', linestyle=':', linewidth=1, color='lightgray')
        ax.set_title('Histogram of ' + col)
        ax.set_ylabel('Frequency')
        ax.set_xlabel(col.upper())