import matplotlib as mpl
import matplotlib.pyplot as plt 
import matplotlib.dates as md
%matplotlib inline
mpl.rcParams['figure.figsize'] = (20,10)
import seaborn as sns
import pandas as pd
import statsmodels.api as sm


# Read the two case-study CSV exports: the raw sensor readings and the
# rolling standard deviation version of those readings.
dataframe_raw = pd.read_csv(filepath_or_buffer='DF_Raw_Data.csv')
dataframe_stdev = pd.read_csv(filepath_or_buffer='DF_Rolling_Stdev.csv')


# Preview the first rows of the raw sensor data.
dataframe_raw.head()


# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataframe_raw.describe()


# Column names, non-null counts and dtypes of the raw data.
dataframe_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2453 entries, 0 to 2452
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Data Source              2453 non-null   object 
 1   TIMEFRAME (DD/MM/YYYY)   2453 non-null   object 
 2   Volumetric Flow Meter 1  2453 non-null   float64
 3   Volumetric Flow Meter 2  2453 non-null   float64
 4   Pump Speed (RPM)         2453 non-null   int64  
 5   Pump Torque              2453 non-null   int64  
 6   Ambient Temperature      2453 non-null   int64  
 7   Horse Power              2453 non-null   float64
 8   Pump Efficiency          2453 non-null   float64
 9   PUMP FAILURE (1 or 0)    2453 non-null   int64  
dtypes: float64(4), int64(4), object(2)
memory usage: 191.8+ KB


# Preview the first rows of the rolling standard deviation data.
dataframe_stdev.head()


# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
dataframe_stdev.describe()


# Column names, non-null counts and dtypes of the rolling standard deviation data.
dataframe_stdev.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2452 entries, 0 to 2451
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Data Source              2452 non-null   object 
 1   TIMEFRAME (DD/MM/YYYY)   2452 non-null   object 
 2   Volumetric Flow Meter 1  2452 non-null   float64
 3   Volumetric Flow Meter 2  2452 non-null   float64
 4   Pump Speed (RPM)         2452 non-null   float64
 5   Pump Torque              2452 non-null   float64
 6   Ambient Temperature      2452 non-null   float64
 7   Horse Power              2452 non-null   float64
 8   Pump Efficiency          2452 non-null   float64
 9   PUMP FAILURE (1 or 0)    2452 non-null   int64  
dtypes: float64(7), int64(1), object(2)
memory usage: 191.7+ KB


# Box plots of the numeric columns: raw readings first, then the rolling
# standard deviation readings.
dataframe_raw.plot(kind='box', title='Boxplot - Raw Data')
dataframe_stdev.plot(kind='box', title='Boxplot - STDEV Data')

<AxesSubplot:title={'center':'Boxplot - STDEV Data'}>


# Line plots of the numeric columns against the row index: raw readings
# first, then the rolling standard deviation readings.
dataframe_raw.plot(kind='line', title='Lineplot - Raw Data')
dataframe_stdev.plot(kind='line', title='Lineplot - STDEV Data')

<AxesSubplot:title={'center':'Lineplot - STDEV Data'}>

Text(0.5, 1.0, 'Boxplot - Standard Deviation Data')


# Box plots of the raw readings split by the failure flag: rows where the
# flag is 0 first, then rows where it is 1.
dataframe_raw.loc[dataframe_raw['PUMP FAILURE (1 or 0)'].eq(0)].plot(kind='box', title='Boxplot - PUMP FAILURE = 0')
dataframe_raw.loc[dataframe_raw['PUMP FAILURE (1 or 0)'].eq(1)].plot(kind='box', title='Boxplot - PUMP FAILURE = 1')

<AxesSubplot:title={'center':'Boxplot - PUMP FAILURE = 1'}>


# First and third quartile of every numeric column, and the spread between
# them (the interquartile range).
# numeric_only=True is passed explicitly because dataframe_raw also holds text
# columns: older pandas skipped them silently, newer releases raise a
# TypeError when asked for the quantile of a non-numeric column.
Q1 = dataframe_raw.quantile(0.25, numeric_only=True)
Q3 = dataframe_raw.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1
IQR

Volumetric Flow Meter 1     2.09
Volumetric Flow Meter 2     2.13
Pump Speed (RPM)           12.00
Pump Torque                24.00
Ambient Temperature         5.00
Horse Power                 0.56
Pump Efficiency             3.91
PUMP FAILURE (1 or 0)       0.00
dtype: float64


# Number of non-null entries in each column of the raw data.
dataframe_raw.count()

Data Source                2453
TIMEFRAME (DD/MM/YYYY)     2453
Volumetric Flow Meter 1    2453
Volumetric Flow Meter 2    2453
Pump Speed (RPM)           2453
Pump Torque                2453
Ambient Temperature        2453
Horse Power                2453
Pump Efficiency            2453
PUMP FAILURE (1 or 0)      2453
dtype: int64


# Flag every value lying more than 1.5 * IQR below the first quartile or
# above the third quartile of its column.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# The fences only carry labels for the columns that have quartiles, while
# dataframe_raw has additional text columns. The flexible comparison methods
# align the fences to the frame's columns explicitly; the bare < and >
# operators depend on automatic reindexing, which pandas deprecated and newer
# releases reject for mismatched labels. Columns without a fence come out as
# False everywhere.
outliers = (
    dataframe_raw.lt(lower_fence, axis='columns')
    | dataframe_raw.gt(upper_fence, axis='columns')
)
# Number of flagged values per column (unflagged cells become NaN and are
# therefore not counted).
dataframe_raw[outliers].count()

Data Source                 0
TIMEFRAME (DD/MM/YYYY)      0
Volumetric Flow Meter 1    44
Volumetric Flow Meter 2    45
Pump Speed (RPM)           25
Pump Torque                21
Ambient Temperature        24
Horse Power                83
Pump Efficiency            43
PUMP FAILURE (1 or 0)      52
dtype: int64


# Flagged values per column as a percentage of that column's non-null entries.
print(dataframe_raw[outliers].count().div(dataframe_raw.count()).mul(100))

Data Source                0.000000
TIMEFRAME (DD/MM/YYYY)     0.000000
Volumetric Flow Meter 1    1.793722
Volumetric Flow Meter 2    1.834488
Pump Speed (RPM)           1.019160
Pump Torque                0.856095
Ambient Temperature        0.978394
Horse Power                3.383612
Pump Efficiency            1.752956
PUMP FAILURE (1 or 0)      2.119853
dtype: float64


# Build the data set *without* outliers.
# Indexing with the boolean frame itself (dataframe_raw[outliers]) does the
# opposite: it keeps only the flagged cells and turns every other cell into
# NaN. Instead, drop each row in which at least one column was flagged.
df_new = dataframe_raw[~outliers.any(axis=1)]
df_new


# Box plots of df_new split by the failure flag: rows where the flag is 0
# first, then rows where it is 1.
df_new.loc[df_new['PUMP FAILURE (1 or 0)'].eq(0)].plot(kind='box', title='Boxplot - PUMP FAILURE = 0')
df_new.loc[df_new['PUMP FAILURE (1 or 0)'].eq(1)].plot(kind='box', title='Boxplot - PUMP FAILURE = 1')

<AxesSubplot:title={'center':'Boxplot - PUMP FAILURE = 1'}>


# Plot every sensor column of the raw data on its own figure, with the
# failure flag (last column) overlaid in red on a secondary y-axis.
# The sensor columns are taken by position: everything after the two leading
# text columns and before the trailing failure flag. Iterating the column
# labels directly avoids binding a variable called `list`, which would shadow
# the builtin for the rest of the session.
failure_flag = dataframe_raw.iloc[:, -1]
for sensor_name in dataframe_raw.columns[2:-1]:
    fig, ax1 = plt.subplots()
    ax1.plot(dataframe_raw[sensor_name])
    ax1.tick_params(axis='y')
    # Second y-axis sharing the same x-axis, so the 0/1 flag keeps its own scale.
    ax2 = ax1.twinx()
    ax2.plot(failure_flag, color='red')
    fig.suptitle('This is for the attribute ' + sensor_name, fontweight="bold", fontsize=24)
    fig.tight_layout()
    plt.show()


# Plot each rolling standard deviation column against the failure flag for
# the one-hour window 10/12/2014 13:30 - 14:30.
df_RTP = dataframe_stdev.set_index('TIMEFRAME (DD/MM/YYYY)')

# The index holds day-first, non-zero-padded text such as '9/12/2014 0:00'.
# Comparing such strings directly is lexicographic (for instance '9:30' sorts
# after '14:30'), so the window is evaluated on parsed timestamps. df_RTP and
# df_rtp_filtered keep their original text index.
timestamp_format = '%d/%m/%Y %H:%M'
sample_times = pd.to_datetime(df_RTP.index, format=timestamp_format)
window_start = pd.to_datetime('10/12/2014 13:30', format=timestamp_format)
window_end = pd.to_datetime('10/12/2014 14:30', format=timestamp_format)
df_rtp_filtered = df_RTP[(sample_times >= window_start) & (sample_times <= window_end)]

# After set_index the first column is the data-source label and the last one
# is the failure flag; everything in between is a sensor column. Iterating the
# labels directly avoids binding a variable called `list`, which would shadow
# the builtin.
failure_flag = df_rtp_filtered.iloc[:, -1]
for sensor_name in df_rtp_filtered.columns[1:-1]:
    fig, ax1 = plt.subplots()
    ax1.plot(df_rtp_filtered[sensor_name])
    ax1.tick_params(axis='y')
    # Second y-axis sharing the same x-axis, so the 0/1 flag keeps its own scale.
    ax2 = ax1.twinx()
    ax2.plot(failure_flag, color='red')
    fig.suptitle('This is for the attribute ' + sensor_name, fontweight="bold", fontsize=24)
    fig.tight_layout()
    plt.show()

	Data Source	TIMEFRAME (DD/MM/YYYY)	Volumetric Flow Meter 1	Volumetric Flow Meter 2	Pump Speed (RPM)	Pump Torque	Ambient Temperature	Horse Power	Pump Efficiency
0	Raw	9/12/2014 0:00	41.30	41.16	98	207	54	3.86	74.84
1	Raw	9/12/2014 0:01	42.40	41.39	92	212	46	3.71	75.25
2	Raw	9/12/2014 0:02	41.43	41.15	80	207	55	3.15	74.82
3	Raw	9/12/2014 0:03	42.21	40.93	83	190	49	3.00	74.42
4	Raw	9/12/2014 0:04	40.51	43.32	90	195	50	3.34	78.76

	Volumetric Flow Meter 1	Volumetric Flow Meter 2	Pump Speed (RPM)	Pump Torque	Ambient Temperature	Horse Power	Pump Efficiency	PUMP FAILURE (1 or 0)
count	2453.000000	2453.000000	2453.000000	2453.000000	2453.000000	2453.000000	2453.000000	2453.000000
mean	41.802629	41.796702	90.796576	202.851610	50.226661	3.540897	76.015149	0.021199
std	3.656576	3.654873	10.217885	22.683977	5.298203	0.579055	6.651633	0.144075
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	41.050000	41.000000	85.000000	191.000000	48.000000	3.220000	74.560000	0.000000
50%	42.100000	42.140000	91.000000	203.000000	51.000000	3.480000	76.620000	0.000000
75%	43.140000	43.130000	97.000000	215.000000	53.000000	3.780000	78.470000	0.000000
max	45.900000	45.840000	124.000000	264.000000	65.000000	7.560000	83.450000	1.000000

	Data Source	TIMEFRAME (DD/MM/YYYY)	Volumetric Flow Meter 1	Volumetric Flow Meter 2	Pump Speed (RPM)	Pump Torque	Ambient Temperature	Horse Power	Pump Efficiency
0	Rolling Stdev (30 Minute)	9/12/2014 0:00	1.04	0.96	5.54	11.70	3.40	0.32	1.74
1	Rolling Stdev (30 Minute)	9/12/2014 0:01	1.06	1.01	5.49	11.73	3.36	0.31	1.83
2	Rolling Stdev (30 Minute)	9/12/2014 0:02	1.06	1.03	5.62	11.94	3.40	0.31	1.87
3	Rolling Stdev (30 Minute)	9/12/2014 0:03	1.06	1.05	5.61	12.10	3.30	0.31	1.90
4	Rolling Stdev (30 Minute)	9/12/2014 0:04	1.07	1.03	5.61	12.31	3.36	0.30	1.88

	Volumetric Flow Meter 1	Volumetric Flow Meter 2	Pump Speed (RPM)	Pump Torque	Ambient Temperature	Horse Power	Pump Efficiency	PUMP FAILURE (1 or 0)
count	2452.000000	2452.000000	2452.000000	2452.000000	2452.000000	2452.00000	2452.000000	2452.000000
mean	1.485126	1.497361	6.648308	13.945338	3.436370	0.37060	2.725232	0.021207
std	2.294950	2.282053	5.722897	12.394302	3.043042	0.29979	4.186723	0.144104
min	0.380000	0.640000	0.580000	5.000000	0.900000	0.11000	1.170000	0.000000
25%	1.070000	1.080000	5.520000	11.210000	2.920000	0.28000	1.960000	0.000000
50%	1.160000	1.170000	5.990000	12.180000	3.160000	0.32000	2.120000	0.000000
75%	1.230000	1.260000	6.460000	13.110000	3.370000	0.36000	2.270000	0.000000
max	21.390000	21.530000	59.310000	124.710000	30.650000	3.32000	39.150000	1.000000

	Data Source	TIMEFRAME (DD/MM/YYYY)	Volumetric Flow Meter 1	Volumetric Flow Meter 2	Pump Speed (RPM)	Pump Torque	Ambient Temperature	Horse Power	Pump Efficiency	PUMP FAILURE (1 or 0)
0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...
2448	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2449	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2450	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2451	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2452	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Welcome to the Southern Water Corp Python Case Study!¶

Let's get started!¶

Part I: Descriptive Statistics¶

Step 1: Import Libraries¶

Place your code here¶

Step 2: Descriptive Statistics¶

Load Data¶

Raw Data¶

Standard Deviation data¶

Step 3: Create a Boxplot¶

Please put your code here¶

We've included an example of what your Box Plot should look like once you've plotted this using the dataframe_raw dataset¶

What have you observed from the boxplot and line plots?¶

You will probably notice that some variables, because of the range and size of their values, dwarf the others, which makes the variation in the smaller variables difficult to see.¶

More importantly, the dataset contains Pump Failure data both for when the pump has failed (i.e. Pump Failure = 1) and for when it is operating normally (i.e. Pump Failure = 0). We should separate this data accordingly to visualise the information more effectively.¶

Step 4: Filtered Dataframes with Box Plots¶

Please put your code here¶

What have you noticed when you compared the dataset in this manner?¶

From analysing the boxplots, you might notice that there seems to be a number of outliers. We might want to see if we can actively remove these with Python.¶

Step 5: Create Quartiles¶

Please put your code here¶

Step 6: Identify Outliers¶

Please put your code here¶

Having removed the outliers from the dataset - do we think this is a good option? Why or why not?¶

Step 7: Create a Boxplot without Outliers¶

Please put your code here¶

Based on the boxplots you've created, you've likely come to the conclusion that, for this case study, you actually shouldn't remove the outliers, as you are attempting to understand the Pump Failure Behavior and the portion of data you need is actually stored WITHIN the Outliers.¶

This is exactly why you should never remove Outliers without Subject Matter Expertise input. Otherwise valuable information may be discarded.¶

Step 8: Plot and Examine Each Column¶

for variable in listOfVariables:¶

Using the syntax provided, loop through the dataframe_raw dataset, plotting every variable individually, against the Pump Failure to better identify trends.¶

Note: Please ensure that the dataframe you are plotting contains all the outliers and that the Pump Failure Behaviour includes both the 0 and 1 State.¶

Please put your code here¶

What do you notice when looking at the data in this way? Do any particular trends emerge?¶

Hint: Remember to make use of the Dual-Axis plot trick you learned in the previous exercise!¶

Step 9: Create a Plot for Pump Failures Over a Rolling Time Period¶

Please put your code here¶

The output from your code should display image(s) like the one shown below¶

Part II: Inferential Statistical Analysis¶

Step 10: Create a Heatmap¶

Please put your code here¶

We've included an example of what the output may look like below¶

Step 11: Create a Barplot of Correlated Features¶

Please put your code here¶

Step 12: Create a Rolling Standard Deviation Heatmap¶

Please put your code here¶

Creating a Multivariate Regression Model¶

Step 13: Use OLS Regression¶

Please put your code here¶

Which linear regression model seems to be a better fit? Why do you think this is the case?¶

Step 14: Validate Predictions¶

Please put your code here¶

Great job! Being able to complete this case study means that you're now proficient with the fundamentals of data analysis in Python!¶