diff --git a/Chapter 5 Statistical Hypothesis Testing.ipynb b/Chapter 5 Statistical Hypothesis Testing.ipynb new file mode 100644 index 0000000..9feca34 --- /dev/null +++ b/Chapter 5 Statistical Hypothesis Testing.ipynb @@ -0,0 +1,1175 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Chapter 5: Statistical Hypothesis Testing.ipynb", + "provenance": [], + "collapsed_sections": [ + "WJhmgDxsVHEO", + "MkgcD5YRqY2t", + "YdiAgoIkqkgY", + "zPLd6dYaxMkT", + "OJMPHhRcyblx", + "CvS6L1Js4duu", + "7ob5bPMC7wnc", + "i9LHPNrU_aNd", + "LROas1z3DtJL", + "7c95dMu8HLoA", + "QFnla-p6YV94", + "7JMLIct9Ydku", + "IWdTeJz-ZuWc", + "6sR1d3BHS9Hh", + "_beFJ80gTH85", + "uXF1GqteUu0E", + "PpZ12cY4Vv_5", + "uCyetBhlW3E3" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# **Chapter 5: Statistical Hypothesis Testing**" + ], + "metadata": { + "id": "RZ4sKXgZqP5R" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Table of Content:**\n", + "\n", + "- [Import Libraries](#Import_Libraries)\n", + "- [5.1. Normality Tests](#Normality_Tests)\n", + " - [5.1.1. Shapiro-Wilk Test](#Shapiro-Wilk_Test)\n", + " - [5.1.2. D’Agostino’s $K^2$ Test](#D’Agostino’s_Test)\n", + " - [5.1.3. Anderson-Darling Test](#Anderson-Darling_Test)\n", + "- [5.2. Correlation Tests](#Correlation_Tests)\n", + " - [5.2.1. Pearson’s Correlation Coefficient](#Pearson’s_Correlation_Coefficient)\n", + " - [5.2.2. Spearman’s Rank Correlation](#Spearman’s_Rank_Correlation)\n", + " - [5.2.3. Kendall’s Rank Correlation](#Kendall’s_Rank_Correlation)\n", + " - [5.2.4. Chi-Squared Test](#Chi-Squared_Test)\n", + "- [5.3. Stationary Tests](#Stationary_Tests)\n", + " - [5.3.1. Augmented Dickey-Fuller Unit Root Test](#Augmented_Dickey-Fuller_Unit_Root_Test)\n", + " - [5.3.2. Kwiatkowski-Phillips-Schmidt-Shin Test](#Kwiatkowski-Phillips-Schmidt-Shin_Test) \n", + "- [5.4. Other Tests](#Other_Tests)\n", + " - [5.4.1. Mann-Whitney U-Test](#Mann-Whitney_U-Test)\n", + " - [5.4.2. Wilcoxon Signed-Rank Test](#Wilcoxon_Signed-Rank-Test)\n", + " - [5.4.3. Kruskal-Wallis H Test](#Kruskal-Wallis_H_Test)\n", + " - [5.4.4. 
Friedman Test](#Friedman_Test) " + ], + "metadata": { + "id": "V9XOxBPqCABJ" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **Import Libraries**" + ], + "metadata": { + "id": "WJhmgDxsVHEO" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install --upgrade scipy" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vXStgb2JU6c0", + "outputId": "f227901d-8551-4fde-bb9a-4bc5e3bef1ab" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (1.4.1)\n", + "Collecting scipy\n", + " Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)\n", + "\u001b[K |████████████████████████████████| 38.1 MB 1.3 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy<1.23.0,>=1.16.5 in /usr/local/lib/python3.7/dist-packages (from scipy) (1.21.6)\n", + "Installing collected packages: scipy\n", + " Attempting uninstall: scipy\n", + " Found existing installation: scipy 1.4.1\n", + " Uninstalling scipy-1.4.1:\n", + " Successfully uninstalled scipy-1.4.1\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", + "Successfully installed scipy-1.7.3\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as mpatches\n", + "import seaborn as sns\n", + "import math\n", + "from scipy import stats\n", + "from scipy.stats import norm\n", + "from scipy.stats import chi2\n", + "from scipy.stats import t\n", + "from scipy.stats import f\n", + "from scipy.stats import bernoulli\n", + "from scipy.stats import binom\n", + "from scipy.stats import nbinom\n", + "from scipy.stats import geom\n", + "from scipy.stats import poisson\n", + "from scipy.stats import uniform\n", + "from scipy.stats import randint\n", + "from scipy.stats import expon\n", + "from scipy.stats import gamma\n", + "from scipy.stats import beta\n", + "from scipy.stats import weibull_min\n", + "from scipy.stats import hypergeom\n", + "from scipy.stats import shapiro\n", + "from scipy.stats import pearsonr\n", + "from scipy.stats import normaltest\n", + "from scipy.stats import anderson\n", + "from scipy.stats import spearmanr\n", + "from scipy.stats import kendalltau\n", + "from scipy.stats import chi2_contingency\n", + "from scipy.stats import ttest_ind\n", + "from scipy.stats import ttest_rel\n", + "from scipy.stats import mannwhitneyu\n", + "from scipy.stats import wilcoxon\n", + "from scipy.stats import kruskal\n", + "from scipy.stats import friedmanchisquare\n", + "from statsmodels.tsa.stattools import adfuller\n", + "from statsmodels.tsa.stattools import kpss\n", + "from statsmodels.stats.weightstats import ztest\n", + "from scipy.integrate import quad\n", + "from IPython.display import display, Latex\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": 
"ZPuphzTmU-P8", + "outputId": "db6527ea-c413-4457-e747-8f817f99006b" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", + " import pandas.util.testing as tm\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **5.1. Normality Tests:**" + ], + "metadata": { + "id": "MkgcD5YRqY2t" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.1.1. Shapiro-Wilk Test:**" + ], + "metadata": { + "id": "YdiAgoIkqkgY" + } + }, + { + "cell_type": "markdown", + "source": [ + "$H_0$ : The sample has a Normal (Gaussian) distribution\n", + "\n", + "$H_1$ : The sample does not have a Normal (Gaussian) distribution.\n", + "\n", + "Assumptions: \n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "\n", + "$\\\\ $\n", + "\n", + "[Shapiro-Wilk Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html)" + ], + "metadata": { + "id": "5THGAbaPqrjg" + } + }, + { + "cell_type": "code", + "source": [ + "N = 100\n", + "alpha = 0.05\n", + "np.random.seed(1)\n", + "data = np.random.normal(0, 1, N)\n", + "\n", + "Test_statistic, p_value = shapiro(data)\n", + "print(f'Test_statistic_shapiro = {Test_statistic}, p_value = {p_value}', '\\n')\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, The data is probably normal.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, The data is not probably normal.')" + ], + "metadata": { + "id": "oEXRlb4QwyRK", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "c857ada2-317e-4851-b71f-356d76a9cb22" + }, + "execution_count": 3, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_shapiro = 0.9920045137405396, p_value = 0.8215526342391968 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, The data is not probably normal.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.1.2. D’Agostino’s $K^2$ Test:**" + ], + "metadata": { + "id": "zPLd6dYaxMkT" + } + }, + { + "cell_type": "markdown", + "source": [ + "$H_0$ : The sample has a Normal (Gaussian) distribution\n", + "\n", + "$H_1$ : The sample does not have a Normal (Gaussian) distribution.\n", + "\n", + "Assumptions: \n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "\n", + "\n", + "\n", + "\n", + "$\\\\ $\n", + "\n", + "[D’Agostino’s $K^2$ Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.normaltest.html)" + ], + "metadata": { + "id": "AZnwcLLAxVJI" + } + }, + { + "cell_type": "code", + "source": [ + "N = 100\n", + "alpha = 0.05\n", + "np.random.seed(1)\n", + "data = np.random.normal(0, 1, N)\n", + "\n", + "Test_statistic, p_value = normaltest(data)\n", + "print(f\"Test_statistic_D'Agostino's K-squared = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, The data is probably normal.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. 
Therefore, The data is not probably normal.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "f5f2c919-1de4-415d-a57e-d478d55e4456", + "id": "AwDtLtHkxVJK" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_D'Agostino's K-squared = 0.10202388832581702, p_value = 0.9502673203169621 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, The data is not probably normal.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.1.3. Anderson-Darling Test:**" + ], + "metadata": { + "id": "OJMPHhRcyblx" + } + }, + { + "cell_type": "markdown", + "source": [ + "$H_0$ : The sample has a Normal (Gaussian) distribution\n", + "\n", + "$H_1$ : The sample does not have a Normal (Gaussian) distribution.\n", + "\n", + "Assumptions: \n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "\n", + "$\\\\ $\n", + "\n", + "[Anderson-Darling Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.anderson.html)\n", + "\n", + "Critical values provided are for the following significance levels:\n", + "\n", + "normal/exponential:\n", + "\n", + "$15\\%, 10\\%, 5\\%, 2.5\\%, 1\\%$\n", + "\n", + "logistic:\n", + "\n", + "$25\\%, 10\\%, 5\\%, 2.5\\%, 1\\%, 0.5\\%$\n", + "\n", + "Gumbel:\n", + "\n", + "$25\\%, 10\\%, 5\\%, 2.5\\%, 1\\%$\n", + "\n", + "If the test statistic is larger than these critical values then for the corresponding significance level, the null hypothesis that the data come from the chosen distribution can be rejected." + ], + "metadata": { + "id": "gU9cF9z9ybly" + } + }, + { + "cell_type": "code", + "source": [ + "N = 100\n", + "np.random.seed(1)\n", + "data = np.random.normal(0, 1, N)\n", + "\n", + "Test_statistic, critical_values, significance_level = anderson(data, dist='norm')\n", + "print(f'Test_statistic_anderson = {Test_statistic}', '\\n')\n", + "\n", + "for i in range(len(critical_values)):\n", + " sl, cv = significance_level[i], critical_values[i]\n", + " if Test_statistic > cv:\n", + " print(f'(Test statistic = {Test_statistic}) > (critical value = {sl}%), therefore for the corresponding significance level, the null hpothesis cannot be rejected.')\n", + " else:\n", + " print(f'(Test statistic = {Test_statistic}) > (critical value = {sl}%), therefore for the corresponding significance level, the null hpothesis is rejected.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "G041pJhiy4ra", + "outputId": "eb4d9936-b41a-4218-865d-eed99eb45eb7" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_anderson = 0.2196508855594459 \n", + "\n", + "(Test statistic = 0.2196508855594459) > (critical value = 15.0%), therefore for the corresponding significance level, the null hpothesis is rejected.\n", + "(Test statistic = 0.2196508855594459) > (critical value = 10.0%), therefore for the corresponding significance level, the null hpothesis is rejected.\n", + "(Test statistic = 0.2196508855594459) > (critical value = 5.0%), therefore for the corresponding significance level, the null hpothesis is rejected.\n", + "(Test statistic = 0.2196508855594459) > (critical value = 2.5%), therefore for the corresponding significance level, the null hpothesis is rejected.\n", + "(Test statistic = 0.2196508855594459) > (critical value = 1.0%), therefore for the 
corresponding significance level, the null hpothesis is rejected.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Note that you can use Anderson-Darling test for other distributions. \n", + "\n", + "The valid values are: {‘norm’, ‘expon’, ‘logistic’, ‘gumbel’, ‘gumbel_l’, ‘gumbel_r’, ‘extreme1’}" + ], + "metadata": { + "id": "JJ5mE7YozixQ" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **5.2. Correlation Tests:**" + ], + "metadata": { + "id": "CvS6L1Js4duu" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.2.1. Pearson’s Correlation Coefficient:**" + ], + "metadata": { + "id": "7ob5bPMC7wnc" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether two data sample have a linear relationship.\n", + "\n", + "$H_0$: The two data are independent.\n", + "\n", + "$H_1$: There is a dependency between the two data.\n", + "\n", + "Assumptions:\n", + "* Observations in each data sample are independent and identically distributed (iid).\n", + "* Observations in each data sample are normally distributed.\n", + "* Observations in each data sample have the same variance.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Pearson’s Correlation Coefficient Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.pearsonr.html)" + ], + "metadata": { + "id": "yNCyRmLS8NIj" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "np.random.seed(1)\n", + "data1 = np.random.normal(0, 1, N)\n", + "data2 = np.random.normal(0, 1, N) + 2\n", + "\n", + "Test_statistic, p_value = pearsonr(data1, data2)\n", + "print(f\"Test_statistic_Pearson's Correlation = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, Two data are probably dependent.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, Two data are probably independent.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "P6ENjsEd0lbp", + "outputId": "2eb12605-e669-466e-91ff-905169eb6995" + }, + "execution_count": 6, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Pearson's Correlation = 0.6556177144470315, p_value = 0.03957633895447448 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis. Therefore, Two data are probably dependent.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "This test is parametric." + ], + "metadata": { + "id": "WbRpE09hGQXz" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.2.2. 
Spearman’s Rank Correlation:**" + ], + "metadata": { + "id": "i9LHPNrU_aNd" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether two data samples have a monotonic relationship.\n", + "\n", + "$H_0$: The two data are independent.\n", + "\n", + "$H_1$: There is a dependency between the two data.\n", + "\n", + "Assumptions:\n", + "* Observations in each data sample are independent and identically distributed (iid).\n", + "* Observations in each data sample can be ranked.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Spearman’s Rank Correlation Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.spearmanr.html)" + ], + "metadata": { + "id": "s4Itzt6K_kaj" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "np.random.seed(1)\n", + "data1 = np.random.normal(0, 1, N)\n", + "data2 = np.random.normal(0, 1, N) + 2\n", + "\n", + "Test_statistic, p_value = spearmanr(data1, data2, alternative = 'two-sided')\n", + "print(f\"Test_statistic_Spearman's Rank Correlation = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, Two data are probably dependent.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, Two data are probably independent.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vSdyARWiAQuh", + "outputId": "17841159-7ace-4bc8-8cae-18dc3d2bccd5" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Spearman's Rank Correlation = 0.7818181818181817, p_value = 0.007547007781067878 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis. Therefore, Two data are probably dependent.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "Alternative hypothesis can be {‘two-sided’, ‘less’, ‘greater’}.\n", + "\n", + "'two-sided': the correlation is non-zero\n", + "\n", + "'less': the correlation is negative (less than zero)\n", + "\n", + "'greater': the correlation is positive (greater than zero)" + ], + "metadata": { + "id": "wibHxhC4BO5b" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.2.3. Kendall’s Rank Correlation:**" + ], + "metadata": { + "id": "LROas1z3DtJL" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether two data samples have a monotonic relationship.\n", + "\n", + "$H_0$: The two data are independent.\n", + "\n", + "$H_1$: There is a dependency between the two data.\n", + "\n", + "Assumptions:\n", + "* Observations in each data sample are independent and identically distributed (iid).\n", + "* Observations in each data sample can be ranked.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Kendall’s Rank Correlation Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kendalltau.html)" + ], + "metadata": { + "id": "084h0SpxDzCX" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "np.random.seed(1)\n", + "data1 = np.random.normal(0, 1, N)\n", + "data2 = np.random.normal(0, 1, N) + 2\n", + "\n", + "Test_statistic, p_value = kendalltau(data1, data2)\n", + "print(f\"Test_statistic_Kendall's Rank Correlation = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. 
Therefore, Two data are probably dependent.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, Two data are probably independent.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Gzxj53vaEUzF", + "outputId": "0f7af409-1a8d-4dbe-f656-63902010e85c" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Kendall's Rank Correlation = 0.6, p_value = 0.016666115520282188 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis. Therefore, Two data are probably dependent.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.2.4. Chi-Squared Test:**" + ], + "metadata": { + "id": "7c95dMu8HLoA" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether two categorical variables are related or independent.\n", + "\n", + "$H_0$: The two data are independent.\n", + "\n", + "$H_1$: There is a dependency between the two data.\n", + "\n", + "Assumptions:\n", + "* Observations used in the calculation of the contingency table are independent.\n", + "* 25 or more examples in each cell of the contingency table.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Chi-Squared Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)\n", + "\n", + "$\\\\ $\n", + "\n", + "degrees of freedom: $(rows - 1) * (cols - 1)$" + ], + "metadata": { + "id": "tcV0_bRnHRI8" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "table = [[10, 20, 30],\n", + "\t\t\t [6, 9, 17]]\n", + "\n", + "Test_statistic, p_value, dof, expected = chi2_contingency(table)\n", + "print(f\"Test_statistic_Chi-Squared = {Test_statistic}, p_value = {p_value}, df = {dof}, \\n\", f\"Expected = {expected}\",\"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, Two data are probably dependent.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, Two data are probably independent.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "0Ik8RPjBHOwr", + "outputId": "aa7be138-2f00-429d-f6ba-eda5f09a94fd" + }, + "execution_count": 9, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Chi-Squared = 0.27157465150403504, p_value = 0.873028283380073, df = 2, \n", + " Expected = [[10.43478261 18.91304348 30.65217391]\n", + " [ 5.56521739 10.08695652 16.34782609]] \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, Two data are probably independent.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **5.3. Stationary Tests:**" + ], + "metadata": { + "id": "QFnla-p6YV94" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.3.1. Augmented Dickey-Fuller Unit Root Test:**" + ], + "metadata": { + "id": "7JMLIct9Ydku" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether a time series has a unit root, e.g. 
has a trend or more generally is autoregressive.\n", + "\n", + "$H_0$: A unit root is present (series is non-stationary).\n", + "\n", + "$H_1$: A unit root is not present (series is stationary).\n", + "\n", + "Assumptions:\n", + "* Observations are temporally ordered.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Augmented Dickey-Fuller Unit Root Test Doc](https://www.statsmodels.org/dev/generated/statsmodels.tsa.stattools.adfuller.html)" + ], + "metadata": { + "id": "FbuyfZBIYoPj" + } + }, + { + "cell_type": "code", + "source": [ + "alpha = 0.05\n", + "data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", + "\n", + "Test_statistic, p_value, lags, obs, crit, icbest = adfuller(data)\n", + "print(f\"Test_statistic_ADF = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, the series is probably stationary.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, the series is probably non-stationary.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "HnKfeoZfYXPt", + "outputId": "ad4c4075-ecda-4be5-f482-940bb90840b1" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_ADF = 0.5171974540944098, p_value = 0.9853865316323872 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, the series is probably non-stationary.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.3.2. Kwiatkowski-Phillips-Schmidt-Shin Test:**" + ], + "metadata": { + "id": "IWdTeJz-ZuWc" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether a time series is trend stationary or not.\n", + "\n", + "$H_0$: The time series is trend-stationary.\n", + "\n", + "$H_1$: The time series is not trend-stationary.\n", + "\n", + "Assumptions:\n", + "* Observations are temporally ordered.\n", + "\n", + "Note that by default `kpss` tests stationarity around a constant; pass `regression='ct'` to test stationarity around a trend.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Kwiatkowski-Phillips-Schmidt-Shin Test Doc](https://www.statsmodels.org/stable/generated/statsmodels.tsa.stattools.kpss.html#statsmodels.tsa.stattools.kpss)" + ], + "metadata": { + "id": "IrK61Uv-ZuWc" + } + }, + { + "cell_type": "code", + "source": [ + "alpha = 0.05\n", + "data = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]\n", + "\n", + "Test_statistic, p_value, lags, crit = kpss(data)\n", + "print(f\"Test_statistic_Kwiatkowski = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, the series is probably not trend-stationary.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, the series is probably trend-stationary.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "d1d6462e-7130-418d-b277-e34f419ad68b", + "id": "EtboNoivZuWd" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Kwiatkowski = 0.4099630996309963, p_value = 0.072860732917674 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, the series is probably trend-stationary.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **5.4. 
Other Tests:**" + ], + "metadata": { + "id": "6sR1d3BHS9Hh" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.4.1. Mann-Whitney U-Test:**" + ], + "metadata": { + "id": "_beFJ80gTH85" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether the distributions of two independent samples are equal or not.\n", + "\n", + "$H_0$: The distributions of both samples are equal.\n", + "\n", + "$H_1$: The distributions of both samples are not equal.\n", + "\n", + "Assumptions:\n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "* Observations in each sample can be ranked.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Mann-Whitney U Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html)" + ], + "metadata": { + "id": "xDJkmODOTOrP" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "data1 = np.random.normal(0, 1, N)\n", + "data2 = np.random.normal(0, 1, N)\n", + "\n", + "Test_statistic, p_value = mannwhitneyu(data1, data2, alternative='two-sided')\n", + "print(f\"Test_statistic_Mann-Whitney = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, Two data distributions are probably not equal.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, Two data distributions are probably equal.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UJaGUqmfTjr4", + "outputId": "0e6858f5-0e32-4345-a83a-ce2b2ed6b517" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Mann-Whitney = 61.0, p_value = 0.4273553138978077 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, Two data distributions are probably equal.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.4.2. Wilcoxon Signed-Rank Test:**" + ], + "metadata": { + "id": "uXF1GqteUu0E" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether the distributions of two paired samples are equal or not.\n", + "\n", + "$H_0$: The distributions of both samples are equal.\n", + "\n", + "$H_1$: The distributions of both samples are not equal.\n", + "\n", + "Assumptions:\n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "* Observations in each sample can be ranked.\n", + "* Observations across each sample are paired.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Wilcoxon Signed-Rank Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html)" + ], + "metadata": { + "id": "wQUAp_96Uu0E" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "data1 = np.random.normal(0, 1, N)\n", + "data2 = np.random.normal(0, 1, N)\n", + "\n", + "Test_statistic, p_value = wilcoxon(data1, data2, alternative='two-sided')\n", + "print(f\"Test_statistic_Wilcoxon = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, Two data distributions are probably not equal.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. 
Therefore, Two data distributions are probably equal.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "96500652-9c3f-4794-fb0e-6fdf3302b134", + "id": "_45cKDykUu0F" + }, + "execution_count": 13, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Wilcoxon = 24.0, p_value = 0.76953125 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, Two data distributions are probably equal.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.4.3. Kruskal-Wallis H Test:**" + ], + "metadata": { + "id": "PpZ12cY4Vv_5" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether the distributions of two or more independent samples are equal or not.\n", + "\n", + "$H_0$: The distributions of all samples are equal.\n", + "\n", + "$H_1$: The distributions of one or more samples are not equal.\n", + "\n", + "Assumptions:\n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "* Observations in each sample can be ranked.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Kruskal-Wallis H Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kruskal.html)" + ], + "metadata": { + "id": "yPeW-V0QVv_6" + } + }, + { + "cell_type": "code", + "source": [ + "N = 10\n", + "alpha = 0.05\n", + "data1 = np.random.normal(0, 1, N)\n", + "data2 = np.random.normal(0, 1, N)\n", + "\n", + "Test_statistic, p_value = kruskal(data1, data2)\n", + "print(f\"Test_statistic_Kruskal-Wallis = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, Two data distributions are probably not equal.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, Two data distributions are probably equal.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "5f856ccd-41f9-420d-997c-0e3507d0ca44", + "id": "gZfSggVCVv_7" + }, + "execution_count": 14, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Kruskal-Wallis = 1.462857142857132, p_value = 0.22647606604348455 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, Two data distributions are probably equal.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **5.4.4. Friedman Test:**" + ], + "metadata": { + "id": "uCyetBhlW3E3" + } + }, + { + "cell_type": "markdown", + "source": [ + "Tests whether the distributions of two or more paired samples are equal or not.\n", + "\n", + "$H_0$: The distributions of all samples are equal.\n", + "\n", + "$H_1$: The distributions of one or more samples are not equal.\n", + "\n", + "Assumptions:\n", + "* Observations in each sample are independent and identically distributed (iid).\n", + "* Observations in each sample can be ranked.\n", + "* Observations across each sample are paired.\n", + "\n", + "$\\\\ $\n", + "\n", + "[Friedman Test Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.friedmanchisquare.html)" + ], + "metadata": { + "id": "fTbP_K-AW3E5" + } + }, + { + "cell_type": "code", + "source": [ + "alpha = 0.05\n", + "data1 = [0.873, 2.817, 0.121, -0.945, -0.055, -1.436, 0.360, -1.478, -1.637, -1.869]\n", + "data2 = [1.142, -0.432, -0.938, -0.729, -0.846, -0.157, 0.500, 1.183, -1.075, -0.169]\n", + "data3 = [-0.208, 0.696, 0.928, -1.148, -0.213, 0.229, 0.137, 0.269, -0.870, -1.204]\n", + "\n", + "Test_statistic, p_value = friedmanchisquare(data1, data2, data3)\n", + "print(f\"Test_statistic_Friedman = {Test_statistic}, p_value = {p_value}\", \"\\n\")\n", + "\n", + "if p_value < alpha:\n", + "\tprint(f'Since p_value < {alpha}, reject null hypothesis. Therefore, data distributions are probably not equal.')\n", + "else:\n", + "\tprint(f'Since p_value > {alpha}, the null hypothesis cannot be rejected. Therefore, data distributions are probably equal.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "042a5986-7d2e-4c15-b2f7-5a72f8714294", + "id": "w0if4yFkW3E5" + }, + "execution_count": 15, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic_Friedman = 0.8000000000000114, p_value = 0.6703200460356356 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected. Therefore, data distributions are probably equal.\n" + ] + } + ] + } + ] +} \ No newline at end of file