'''
Python module for handling data statistics
'''
from __future__ import annotations
from warnings import filterwarnings
filterwarnings('ignore')
import seaborn as sns
import pandas as pd
import numpy as np
from scipy.stats.stats import pearsonr
from itertools import cycle
from random import choice
from matplotlib import pyplot as plt
class Correlation:
    r"""
    Correlation matrices based on Pearson's or Chatterjee's coefficient.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Data whose columns are correlated against each other. The frame is
        stored by reference and is never modified by this class.

    Example
    -------
    >>> corr = Correlation(df)
    >>> matrix = corr.corr(method='chatterjee')
    >>> corr.plot_heatmap(cmap='Reds')
    """

    # Accepted spellings of each method, mapped onto the canonical name.
    _METHODS = {
        'chatterjee': 'chatterjee',
        'nonlinear': 'chatterjee',
        'pearsonr': 'pearsonr',
        'linear': 'pearsonr',
    }

    def __init__(self, dataframe: pd.DataFrame):
        self._df = dataframe
        # Default used by plot_heatmap() when corr() has not been called yet.
        self._method = 'chatterjee'

    def _chatterjee(self, x: pd.Series, y: pd.Series) -> float:
        """
        Compute Chatterjee's rank correlation of ``y`` on ``x``.

        Rows where either value is missing are discarded before ranking so
        that the sample size matches the observations actually used.

        Return
        ------
        float
            ``1 - 3 * sum(|r[i+1] - r[i]|) / (n**2 - 1)`` where ``r`` are the
            ranks of ``y`` ordered by the rank of ``x``. NaN when fewer than
            two complete pairs are available.
        """
        pairs = pd.DataFrame({'x': x, 'y': y}).dropna()
        n = len(pairs)
        if n < 2:
            # The denominator n**2 - 1 is zero (or meaningless) here.
            return float('nan')
        ranks = pairs.rank().sort_values('x')
        sum_term = ranks['y'].diff().abs().sum()
        return float(1 - 3 * sum_term / (n ** 2 - 1))

    def _pearson(self, x: pd.Series, y: pd.Series) -> float:
        """Return Pearson's r for two NaN-free series."""
        statistic, _ = pearsonr(x, y)
        return float(statistic)

    def corr(self, method: str = 'chatterjee') -> pd.DataFrame:
        r"""
        Calculate the linear (Pearson's) or non-linear (Chatterjee's)
        relationship between every pair of columns.

        Parameters
        ----------
        method : str, default 'chatterjee'
            Method of correlation. {'chatterjee', 'pearsonr', 'linear', 'nonlinear'}

            * 'linear' is the same as 'pearsonr'
            * 'nonlinear' is the same as 'chatterjee'

        Returns
        -------
        pd.DataFrame
            Square float32 matrix labelled by column name. The entry at row
            ``r`` and column ``c`` is computed with column ``c`` as ``x`` and
            column ``r`` as ``y`` (this matters for Chatterjee's coefficient,
            which is not symmetric).

        Raises
        ------
        ValueError
            If ``method`` is not one of the supported names.

        Notes
        -----
        For Pearson, rows containing a missing value in any column are
        excluded. The exclusion is applied to a local copy only.

        Example
        -------
        >>> corr = Correlation(df)
        >>> v = corr.corr(method='chatterjee')
        """
        if method not in self._METHODS:
            raise ValueError(
                f"Unknown correlation method {method!r}; "
                f"expected one of {sorted(self._METHODS)}"
            )
        self._method = method

        if self._METHODS[method] == 'pearsonr':
            # Drop incomplete rows once, without touching self._df.
            data = self._df.dropna()
            measure = self._pearson
        else:
            data = self._df
            measure = self._chatterjee

        names = self._df.columns.tolist()
        values = np.empty((len(names), len(names)), dtype=np.float64)
        for col_pos, x_name in enumerate(names):
            for row_pos, y_name in enumerate(names):
                values[row_pos, col_pos] = measure(data[x_name], data[y_name])
        return pd.DataFrame(values.astype(np.float32), index=names, columns=names)

    def plot_heatmap(self, title: str = 'Correlation Heatmap', figsize: tuple[float, float] = (12, 7),
                     annot: bool = True, cmap=None):
        r"""
        Plot the heat map of the correlation matrix.

        The matrix is recomputed with the method most recently passed to
        :meth:`corr` ('chatterjee' if :meth:`corr` has not been called).
        A new figure is created for every call.

        Parameters
        ----------
        title : str
            Title of plot
        figsize : tuple of (width, height)
            Size of plot
        annot : bool, default True
            To annotate the coefficient in the plot
        cmap : matplotlib colormap name or object, or list of colors, optional
            The mapping from data values to color space

        Example
        -------
        >>> corr = Correlation(df)
        >>> v = corr.corr(method='chatterjee')
        >>> corr.plot_heatmap(cmap='Reds')
        """
        matrix = self.corr(self._method)
        # Size this figure only, instead of changing the global rcParams.
        plt.figure(figsize=figsize)
        axes = sns.heatmap(matrix, annot=annot, vmin=-1, vmax=1, cmap=cmap)
        axes.set_title(title)
def displayFreq(df: pd.DataFrame, *cols: str, bins: int = 12, figsize: tuple[float, float] = (8, 8)):
    """
    Plot the frequency distribution of well log curves.

    One histogram is drawn per requested column on a single new figure.
    A single column fills the figure; otherwise the histograms are laid out
    on two rows.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe of data
    cols : str
        One or more names of log curves whose distribution is shown
    bins : int
        Number of bins to group the data
    figsize : tuple of (width, height)
        Size of plot

    Returns
    -------
    None
        Draws the frequency distribution of the well log curves.

    Raises
    ------
    ValueError
        If no column name is given.

    Example
    -------
    >>> from petrolib.stats import displayFreq
    >>> displayFreq(df, 'GR','CALI', 'COAL', 'DT', 'DT_LOG', bins=15, figsize=(20,10))
    """
    if not cols:
        raise ValueError('displayFreq requires at least one column name')

    n_plots = len(cols)
    n_cols = (n_plots + 1) // 2
    n_rows = 1 if n_plots == 1 else 2

    # Build exactly one grid; mixing plt.subplots() with plt.subplot() on a
    # different grid leaves stray empty axes behind the histograms.
    fig = plt.figure(figsize=figsize)
    fig.suptitle('Frequency Distribution', fontsize=20)

    # Colours repeat in a fixed order once the palette is exhausted.
    palette = cycle('bgrcmk')
    for position, (col, color) in enumerate(zip(cols, palette), start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        df[col].plot.hist(ax=ax, bins=bins, color=color, alpha=0.5)
        ax.grid(which='major', linestyle=':', linewidth=1, color='lightgray')
        ax.set_title('Histogram of ' + col)
        ax.set_ylabel('Frequency')
        ax.set_xlabel(col.upper())