# Basic Libraries
import pandas as pd
import numpy as np 

# Visualizations Libraries
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
sns.set()


# Load the raw CSV once and keep it untouched; the exploration below works on
# an independent copy so the original can always be recovered.
dataframe = pd.read_csv('water_potability.csv')
df = dataframe.copy(deep=True)
# Show the (rows, columns) dimensions of the working copy.
df.shape

(3276, 10)


# Draw 10 random rows from the data frame to eyeball typical values.
df.sample(n=10)


# Summary statistics (count, mean, std, min, quartiles, max) for every column
# of the entire data frame.
df.describe()


# Summary statistics restricted to the samples that are potable
# (Potability == 1).
df[df['Potability'] == 1].describe()


# Confirm the data type of each variable in the data frame.
df.dtypes

ph                 float64
Hardness           float64
Solids             float64
Chloramines        float64
Sulfate            float64
Conductivity       float64
Organic_carbon     float64
Trihalomethanes    float64
Turbidity          float64
Potability           int64
dtype: object


# Count the number of distinct values of each variable in the data frame.
df.nunique()

ph                 2785
Hardness           3276
Solids             3276
Chloramines        3276
Sulfate            2495
Conductivity       3276
Organic_carbon     3276
Trihalomethanes    3114
Turbidity          3276
Potability            2
dtype: int64


# Bar chart of the number of unique values per feature.
sns.set_style("white")
plt.figure(figsize=(12, 10))
# nunique() yields one entry per column; only the first nine are kept, which
# leaves out the final Potability column.
nu = df.nunique().rename_axis('feature').reset_index(name='nunique').head(9)
ax = sns.barplot(data=nu, x='feature', y='nunique')
plt.title("Snapshot of Unique Values across Water Potability Features", fontsize=16, weight="bold")
plt.xlabel("Variable", fontsize=14, weight="bold")
plt.ylabel("Number of unique values", fontsize=14, weight="bold")
plt.xticks(rotation=45, fontsize=12)
plt.show()


# List every row that exactly repeats an earlier row of the data frame.
print("Duplicated rows:")
duplicates = df.loc[df.duplicated()]
duplicates

Duplicated rows:


# Report what percentage of a single feature is null.
def na_per(boop):
    """Print the share of missing values in one column.

    Parameters
    ----------
    boop : pandas.Series
        Column to inspect; its ``name`` appears in the printed message.

    Prints one line with the percentage of null entries (rounded to two
    decimals) and the raw null count. Nothing is returned.
    """
    missing_count = boop.isna().sum()
    missing_pct = round(missing_count / len(boop) * 100, 2)
    print(f"{missing_pct}% of {boop.name} are missing values(n = {missing_count})")

# Report the share of missing values for every column of the full data set.
# The columns are iterated directly; the enumerate index was never used.
for feature in df.columns:
    na_per(df[feature])

14.99% of ph are missing values(n = 491)
0.0% of Hardness are missing values(n = 0)
0.0% of Solids are missing values(n = 0)
0.0% of Chloramines are missing values(n = 0)
23.84% of Sulfate are missing values(n = 781)
0.0% of Conductivity are missing values(n = 0)
0.0% of Organic_carbon are missing values(n = 0)
4.95% of Trihalomethanes are missing values(n = 162)
0.0% of Turbidity are missing values(n = 0)
0.0% of Potability are missing values(n = 0)


# Missing-value report restricted to the non-potable samples (Potability == 0).
x1 = df.loc[df.Potability == 0]
# Iterate the columns directly; the enumerate index was never used.
for feature in x1.columns:
    na_per(x1[feature])

15.72% of ph are missing values(n = 314)
0.0% of Hardness are missing values(n = 0)
0.0% of Solids are missing values(n = 0)
0.0% of Chloramines are missing values(n = 0)
24.42% of Sulfate are missing values(n = 488)
0.0% of Conductivity are missing values(n = 0)
0.0% of Organic_carbon are missing values(n = 0)
5.36% of Trihalomethanes are missing values(n = 107)
0.0% of Turbidity are missing values(n = 0)
0.0% of Potability are missing values(n = 0)


# Missing-value report restricted to the potable samples (Potability == 1).
x2 = df.loc[df.Potability == 1]

# Iterate the columns directly; the enumerate index was never used.
for feature in x2.columns:
    na_per(x2[feature])

13.85% of ph are missing values(n = 177)
0.0% of Hardness are missing values(n = 0)
0.0% of Solids are missing values(n = 0)
0.0% of Chloramines are missing values(n = 0)
22.93% of Sulfate are missing values(n = 293)
0.0% of Conductivity are missing values(n = 0)
0.0% of Organic_carbon are missing values(n = 0)
4.3% of Trihalomethanes are missing values(n = 55)
0.0% of Turbidity are missing values(n = 0)
0.0% of Potability are missing values(n = 0)


# Compare the features with the dependent variable (Potability): mean of every
# feature within each potability class.
df.groupby(by='Potability').mean()


# Pairwise correlation of all columns with each other, rounded to three decimals.
df.corr().round(3)


# Tabulate how many samples fall in each potability class, both as raw counts
# and as a percentage of all samples.
potability_series = df['Potability'].value_counts()
potability_per = pd.Series(
    [potability_series[label] / len(df) * 100 for label in (0, 1)]
)
potability_labels = pd.Series(['Not potable', 'Potable'])
# All three Series are labelled 0 and 1, so the frame lines them up by class.
df2 = pd.DataFrame({
    "Potability": potability_labels,
    'Count': potability_series,
    'Percent': potability_per.round(2),
})
df2


# Bar chart of the number of samples in the potable and non-potable classes.
plt.bar(df2["Potability"], df2["Count"], color="midnightblue")
plt.title("Potability of Samples", size=16, fontweight="bold")
plt.ylabel("Number of Samples", size=13)
plt.xticks(size=13)
plt.yticks(size=13)
plt.show()


# Open a large figure and put a single shared y-axis caption on its left edge.
plt.figure(figsize=(15, 10), tight_layout=True).text(
    -0.01, 0.5, 'Number of Samples',
    va='center', rotation='vertical', fontsize=18, weight="bold",
)
# One histogram per feature on a 3x3 grid; the Potability column is skipped.
for i, feature in enumerate(df):
    if feature == 'Potability':
        continue
    plt.subplot(3, 3, i + 1)
    plt.hist(df[feature], bins=30, color="#108A99")
    plt.title(f"Distribution of {feature}", fontsize=14, weight="bold")
    # Per-panel axis labels are blanked; the figure-level caption is used instead.
    plt.xlabel("")
    plt.ylabel("")
    sns.despine()


# Same 3x3 layout as the plain feature histograms, but each panel overlays the
# potable and non-potable samples as density-normalised histograms.
plt.figure(figsize=(15, 10), tight_layout=True).text(
    -0.01, 0.5, 'Number of Samples',
    va='center', rotation='vertical', fontsize=18, weight="bold",
)
# Colours and legend names are listed in the order the data is passed to
# plt.hist below: potable first, then non-potable.
colors = ["steelblue", "firebrick"]
names = ["Potable", "Not potable"]
for i, feature in enumerate(df):
    if feature == 'Potability':
        continue
    plt.subplot(3, 3, i + 1)
    x1 = df.loc[df.Potability == 0, feature]
    x2 = df.loc[df.Potability == 1, feature]
    plt.hist(
        [x2, x1],
        bins=30,
        density=True,
        stacked=False,
        alpha=0.8,
        color=colors,
        label=names,
    )
    plt.legend()
    plt.title(f"{feature} by Water Potability", fontsize=14, weight="bold")
    plt.xlabel("")
    plt.ylabel("")
    sns.despine()


# Fill null values with the feature mean, working on a copy so df keeps its
# original missing values.
# The filled column is assigned back instead of calling
# df3[col].fillna(..., inplace=True): that chained form acts on a temporary
# column object, emits a FutureWarning in recent pandas, and leaves df3
# unchanged once Copy-on-Write is enabled.
df3 = df.copy()
for column in ('ph', 'Sulfate', 'Trihalomethanes'):
    df3[column] = df3[column].fillna(df3[column].mean())


# Check for any remaining missing values after the mean imputation.
# The columns are iterated directly; the enumerate index was never used.
for feature in df3.columns:
    na_per(df3[feature])

0.0% of ph are missing values(n = 0)
0.0% of Hardness are missing values(n = 0)
0.0% of Solids are missing values(n = 0)
0.0% of Chloramines are missing values(n = 0)
0.0% of Sulfate are missing values(n = 0)
0.0% of Conductivity are missing values(n = 0)
0.0% of Organic_carbon are missing values(n = 0)
0.0% of Trihalomethanes are missing values(n = 0)
0.0% of Turbidity are missing values(n = 0)
0.0% of Potability are missing values(n = 0)


# Logistic-regression fit of potability against each feature, one panel per
# feature on a 3x3 grid, using the mean-imputed data.
plt.figure(figsize=(20, 15), tight_layout=True)
# The boolean target is computed once here and reused for every panel
# (previously this variable was unused and the conversion was repeated inside
# the loop).
y = df3['Potability'].astype(bool)
for i, feature in enumerate(df3.columns):
    if feature == 'Potability':
        continue  # the target itself is not plotted
    plt.subplot(3, 3, i + 1)
    sns.regplot(
        x=df3[feature],
        y=y,
        data=df3,
        logistic=True,
        scatter_kws={'color': "steelblue"},
        line_kws={'color': "midnightblue"},
    )
    plt.title(feature, fontsize=14, weight="bold")


# Scatter-plot matrix of all variables, coloured by the Potability column.
g = sns.pairplot(data=df, hue="Potability")
# Replace the legend entries with readable class names, in legend order.
new_labels = ['Non-potable', ' Potable']
for legend_text, label in zip(g._legend.texts, new_labels):
    legend_text.set_text(label)

	ph	Hardness	Solids	Chloramines	Sulfate	Conductivity	Organic_carbon	Trihalomethanes	Turbidity	Potability
646	NaN	221.620058	11954.700730	6.657053	NaN	391.238714	12.961433	NaN	3.282061	0
2873	NaN	231.305127	15629.762335	7.018067	297.748867	406.340425	11.835535	49.679696	4.464244	0
595	8.269945	192.003776	11889.217747	8.377233	345.987354	405.340609	17.395662	85.444985	3.076480	0
1113	8.775486	157.354289	22113.391452	7.931503	304.807845	480.715208	15.692839	82.929345	4.393944	1
2325	5.685216	154.570002	17628.043059	6.664444	390.094731	324.427772	17.352586	67.407300	4.066157	1
1883	7.456251	226.538289	9163.294448	5.726224	NaN	417.125210	10.066076	67.304224	4.279697	0
919	5.606856	211.513097	32423.462685	6.812943	NaN	446.386404	20.926950	52.218979	4.070966	0
26	3.445062	207.926260	33424.768678	8.782147	384.007006	441.785876	13.805902	30.284597	4.184397	0
1117	NaN	178.154855	34383.597639	6.425770	311.526890	570.378765	17.592126	68.992741	4.005893	1
1837	NaN	216.271624	26964.317181	6.383548	418.494244	573.159591	19.892680	50.170522	4.501025	0

	ph	Hardness	Solids	Chloramines	Sulfate	Conductivity	Organic_carbon	Trihalomethanes	Turbidity	Potability
count	2785.000000	3276.000000	3276.000000	3276.000000	2495.000000	3276.000000	3276.000000	3114.000000	3276.000000	3276.000000
mean	7.080795	196.369496	22014.092526	7.122277	333.775777	426.205111	14.284970	66.396293	3.966786	0.390110
std	1.594320	32.879761	8768.570828	1.583085	41.416840	80.824064	3.308162	16.175008	0.780382	0.487849
min	0.000000	47.432000	320.942611	0.352000	129.000000	181.483754	2.200000	0.738000	1.450000	0.000000
25%	6.093092	176.850538	15666.690297	6.127421	307.699498	365.734414	12.065801	55.844536	3.439711	0.000000
50%	7.036752	196.967627	20927.833607	7.130299	333.073546	421.884968	14.218338	66.622485	3.955028	0.000000
75%	8.062066	216.667456	27332.762127	8.114887	359.950170	481.792304	16.557652	77.337473	4.500320	1.000000
max	14.000000	323.124000	61227.196008	13.127000	481.030642	753.342620	28.300000	124.000000	6.739000	1.000000

	ph	Hardness	Solids	Chloramines	Sulfate	Conductivity	Organic_carbon	Trihalomethanes	Turbidity	Potability
count	1101.000000	1278.000000	1278.000000	1278.000000	985.000000	1278.000000	1278.000000	1223.000000	1278.000000	1278.0
mean	7.073783	195.800744	22383.991018	7.169338	332.566990	425.383800	14.160893	66.539684	3.968328	1.0
std	1.448048	35.547041	9101.010208	1.702988	47.692818	82.048446	3.263907	16.327419	0.780842	0.0
min	0.227499	47.432000	728.750830	0.352000	129.000000	201.619737	2.200000	8.175876	1.492207	1.0
25%	6.179312	174.330531	15668.985035	6.094134	300.763772	360.939023	12.033897	56.014249	3.430909	1.0
50%	7.036752	196.632907	21199.386614	7.215163	331.838167	420.712729	14.162809	66.678214	3.958576	1.0
75%	7.933068	218.003420	27973.236446	8.199261	365.941346	484.155911	16.356245	77.380975	4.509569	1.0
max	13.175402	323.124000	56488.672413	13.127000	481.030642	695.369528	23.604298	124.000000	6.494249	1.0

	ph	Hardness	Solids	Chloramines	Sulfate	Conductivity	Organic_carbon	Trihalomethanes	Turbidity
Potability
0	7.085378	196.733292	21777.490788	7.092175	334.56429	426.730454	14.364335	66.303555	3.965800
1	7.073783	195.800744	22383.991018	7.169338	332.56699	425.383800	14.160893	66.539684	3.968328

	ph	Hardness	Solids	Chloramines	Sulfate	Conductivity	Organic_carbon	Trihalomethanes	Turbidity	Potability
ph	1.000	0.082	-0.089	-0.034	0.018	0.019	0.044	0.003	-0.039	-0.004
Hardness	0.082	1.000	-0.047	-0.030	-0.107	-0.024	0.004	-0.013	-0.014	-0.014
Solids	-0.089	-0.047	1.000	-0.070	-0.172	0.014	0.010	-0.009	0.020	0.034
Chloramines	-0.034	-0.030	-0.070	1.000	0.027	-0.020	-0.013	0.017	0.002	0.024
Sulfate	0.018	-0.107	-0.172	0.027	1.000	-0.016	0.031	-0.030	-0.011	-0.024
Conductivity	0.019	-0.024	0.014	-0.020	-0.016	1.000	0.021	0.001	0.006	-0.008
Organic_carbon	0.044	0.004	0.010	-0.013	0.031	0.021	1.000	-0.013	-0.027	-0.030
Trihalomethanes	0.003	-0.013	-0.009	0.017	-0.030	0.001	-0.013	1.000	-0.022	0.007
Turbidity	-0.039	-0.014	0.020	0.002	-0.011	0.006	-0.027	-0.022	1.000	0.002
Potability	-0.004	-0.014	0.034	0.024	-0.024	-0.008	-0.030	0.007	0.002	1.000

Analysis of Water Quality - An Exploratory Data Analysis Project¶

Jessica Hooker¶

January 2023¶

Introduction¶

A Brief Description of the Dataset¶

Libraries & Visualization Parameters¶

Exploring the Structure of Dataset¶

Preliminary Notes¶

Data Quality and Preprocessing¶

Check for Data Duplication¶

Distribution of nulls¶

Feature Visualization¶

Feature Distribution¶

Feature Distribution by Potability of Sample¶

Addressing Missing Values¶

Predicting Water Potability¶

Exploring associations between features in relation to water potability.¶

	Potability	Count	Percent
0	Not potable	1998	60.99
1	Potable	1278	39.01