Data Cleaning Performance Assessment

Sebastian Granados
Western Governors University
D206 - Data Cleaning
Dr. Keiona Middleton
March 3, 2023

Part I: Research Question and Variables¶

A. Research Question¶

I will be using the Telecommunications Churn data set for my research question. The question is “Which age group uses the most data in a year?” This uses the Age and Bandwidth_GB_Year variables. Identifying the heaviest users would let the company target them with advertising for service upgrades, such as faster internet or a larger data cap.

B. Variables¶

Variable Data Type Description Example
Unnamed: 0 int64/quantitative Index by integer 1
CaseOrder int64/quantitative Integer placeholder to preserve original order. 1
Customer_id object/qualitative Unique character string identifying the customer. K409198
Interaction object/qualitative Unique character string related to customer transactions, technical support, and sign-ups. aa90260b-4141-4a24-8e36-b04ce1f4f77b
City object/qualitative Character string for the customer's city of residence listed on the billing statement. Point Baker
State object/qualitative Character string for the customer's state of residence listed on the billing statement. AK
County object/qualitative Character string for the customer's county of residence listed on the billing statement. Prince of Wales-Hyder
Zip int64/qualitative Integer for the customer's zip code of residence on the billing statement. 99927
Lat float64/quantitative Float with the latitude of the customer's residence on the billing statement 56.251
Lng float64/quantitative Float with the longitude of the customer's residence on the billing statement -133.37571
Population int64/quantitative Integer indicating the population within a mile radius of customer based on census data 38
Area object/qualitative Character string that lists the area type based on census data Urban
TimeZone object/qualitative Character string that indicates the time zone of the customer's residence America/Sitka
Job object/qualitative Character string indicating the job of the customer Environmental health practitioner
Children float64/quantitative Float that indicates the number of children in the customer’s household. 2
Age float64/quantitative Float that indicates the age of the customer 44
Education object/qualitative Character string that shows the highest degree earned by customer Master's Degree
Employment object/qualitative Character string that indicates the customer’s employment status Part Time
Income float64/quantitative Float showing the annual income of customer 28561.99
Marital object/qualitative Character string indicating customer’s marital status Widowed
Gender object/qualitative Character string indicating customer’s self-identification as male, female, or nonbinary Male
Churn object/qualitative Character string indicating whether the customer discontinued service within the last month. No
Outage_sec_perweek float64/quantitative Float that indicates the average number of seconds per week of outages in the customer’s neighborhood 6.972566093
Email int64/quantitative Integer counting the number of emails sent to customer in the last year 10
Contacts int64/quantitative Integer counting the number of times the customer contacted technical support 0
Yearly_equip_failure int64/quantitative Integer counting the number of times the customer’s equipment failed and had to be reset or replaced in the last year 1
Techie object/qualitative Character string indicating if customer considers themselves technically inclined No
Contract object/qualitative Character string indicating the contract term of the customer One year
Port_modem object/qualitative Character string indicating whether the customer has a portable modem Yes
Tablet object/qualitative Character string indicating if the customer owns a tablet Yes
InternetService object/qualitative Character string indicating the type of internet service the customer has Fiber Optic
Phone object/qualitative Character string indicating if the customer has a phone service Yes
Multiple object/qualitative Character string indicating if customer has multiple lines No
OnlineSecurity object/qualitative Character string indicating if customer has online security service Yes
OnlineBackup object/qualitative Character string indicating if customer has online backup service Yes
DeviceProtection object/qualitative Character string indicating if customer has Device Protection service No
TechSupport object/qualitative Character string indicating if customer has Tech Support service No
StreamingTV object/qualitative Character string indicating if customer has streaming TV No
StreamingMovies object/qualitative Character string indicating if customer has streaming movies Yes
PaperlessBilling object/qualitative Character string indicating if customer has paperless billing Yes
PaymentMethod object/qualitative Character string indicating the customer’s payment method Credit Card (automatic)
Tenure float64/quantitative Float counting the number of months the customer has been with the provider 6.795512947
MonthlyCharge float64/quantitative Float indicating the monthly average amount charged to the customer 171.4497621
Bandwidth_GB_Year float64/quantitative Float showing the yearly average amount of data in GB used by the customer 904.5361102
The following variables are responses from a survey asking customers to rate the importance of various factors on a scale of 1-8 (1 = most important, 8 = least important):
item1 int64/qualitative Timely response 5
item2 int64/qualitative Timely fixes 5
item3 int64/qualitative Timely replacements 5
item4 int64/qualitative Reliability 3
item5 int64/qualitative Options 4
item6 int64/qualitative Respectful response 4
item7 int64/qualitative Courteous exchange 3
item8 int64/qualitative Evidence of active listening 4

Part II: Data-Cleaning Plan¶

C1. Methods to Detect Anomalies¶

The methods used to clean the data are as follows:

  1. Import all needed packages (pandas, numpy, missingno, seaborn, IterativeImputer, KNNImputer, PCA, DataFrame, stats from scipy.stats, and matplotlib.pyplot)
  2. Import the telecommunication churn data set using the pd.read_csv() function and assign it to "churn"
  3. Display a summary of the data using churn.info()
  4. Check for duplicates using churn.duplicated().value_counts()
  5. Check for null values. First set pandas to display the maximum number of rows using pd.set_option('display.max_rows',500). Next, display how many null values each variable contains with churn.isnull().sum(). To visualize the missing data, use the msno.matrix() function.
  6. To identify outliers, calculate the z-score for each variable using stats.zscore(). Depending on whether further investigation is needed, create a histogram using .hist(), a query of the outliers using .query(), or a boxplot using sns.boxplot(). A consolidated sketch of these detection steps follows this list.
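The detection commands above can be combined into a short script. This is a minimal sketch rather than the full assessment code (which appears in C4); the file path and the use of Age as the example column are assumptions.

import pandas as pd
import scipy.stats as stats

churn = pd.read_csv('churn_raw_data.csv')    # hypothetical path
churn.info()                                 # size, shape, dtypes, non-null counts
print(churn.duplicated().value_counts())     # duplicate-row check
print(churn.isnull().sum())                  # null count per variable
age_z = stats.zscore(churn['Age'].dropna())  # z-scores; |z| > 3 flags potential outliers
print((abs(age_z) > 3).sum())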

C2. Method Justification¶

The chosen methods detect some of the most common errors in data. Displaying a summary of the data gives an overview of the variables, including the size and shape, data types, variable names, and any misspellings; it also shows which variables need renaming. The next step is to check for duplicates, which prevents cleaning the same data multiple times. Checking for null values identifies which values to impute; if the null values were left alone, "...this problem can impact your analysis conclusions" (Lesson 5: Missing Data, n.d.). To keep the data as accurate as possible, we also need to check for outliers; one way is to "use the z-values to identify outliers" (Larose, 2019, p. 40). After calculating the z-values, they can be used in multiple ways to display outliers, including queries, histograms, and boxplots.

C3. Python Language Justification¶

I chose Python as the programming language because I have a background in software development, so it feels more intuitive than R. Once I know the basic functionality of Python's methods and functions, methods from other libraries are simple to implement as long as I understand the documentation. "...the consistent syntax of Python makes learning new packages and modules a straightforward task." (R or Python, 2022) I also believe that Python will be more useful for more advanced tasks in the future.

C4. Python Code¶

Import needed libraries and packages¶

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import missingno as msno
import scipy.stats as stats
from pandas import DataFrame
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

Import telecommunications churn data set¶

In [2]:
churn = pd.read_csv('C:/Users/GrndS/WGU_MSDA/D206/churn_raw_data.csv')
In [3]:
churn.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 52 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            10000 non-null  int64  
 1   CaseOrder             10000 non-null  int64  
 2   Customer_id           10000 non-null  object 
 3   Interaction           10000 non-null  object 
 4   City                  10000 non-null  object 
 5   State                 10000 non-null  object 
 6   County                10000 non-null  object 
 7   Zip                   10000 non-null  int64  
 8   Lat                   10000 non-null  float64
 9   Lng                   10000 non-null  float64
 10  Population            10000 non-null  int64  
 11  Area                  10000 non-null  object 
 12  Timezone              10000 non-null  object 
 13  Job                   10000 non-null  object 
 14  Children              7505 non-null   float64
 15  Age                   7525 non-null   float64
 16  Education             10000 non-null  object 
 17  Employment            10000 non-null  object 
 18  Income                7510 non-null   float64
 19  Marital               10000 non-null  object 
 20  Gender                10000 non-null  object 
 21  Churn                 10000 non-null  object 
 22  Outage_sec_perweek    10000 non-null  float64
 23  Email                 10000 non-null  int64  
 24  Contacts              10000 non-null  int64  
 25  Yearly_equip_failure  10000 non-null  int64  
 26  Techie                7523 non-null   object 
 27  Contract              10000 non-null  object 
 28  Port_modem            10000 non-null  object 
 29  Tablet                10000 non-null  object 
 30  InternetService       10000 non-null  object 
 31  Phone                 8974 non-null   object 
 32  Multiple              10000 non-null  object 
 33  OnlineSecurity        10000 non-null  object 
 34  OnlineBackup          10000 non-null  object 
 35  DeviceProtection      10000 non-null  object 
 36  TechSupport           9009 non-null   object 
 37  StreamingTV           10000 non-null  object 
 38  StreamingMovies       10000 non-null  object 
 39  PaperlessBilling      10000 non-null  object 
 40  PaymentMethod         10000 non-null  object 
 41  Tenure                9069 non-null   float64
 42  MonthlyCharge         10000 non-null  float64
 43  Bandwidth_GB_Year     8979 non-null   float64
 44  item1                 10000 non-null  int64  
 45  item2                 10000 non-null  int64  
 46  item3                 10000 non-null  int64  
 47  item4                 10000 non-null  int64  
 48  item5                 10000 non-null  int64  
 49  item6                 10000 non-null  int64  
 50  item7                 10000 non-null  int64  
 51  item8                 10000 non-null  int64  
dtypes: float64(9), int64(15), object(28)
memory usage: 4.0+ MB

Checking for duplicates¶

In [4]:
churn.duplicated().value_counts()
Out[4]:
False    10000
dtype: int64

No duplicates found

Remove unnecessary variables/columns¶

The variable 'Unnamed: 0' is redundant since it duplicates another unique variable, 'CaseOrder'.

In [5]:
churn.drop('Unnamed: 0', axis='columns', inplace = True)
In [6]:
churn.head(5)
Out[6]:
CaseOrder Customer_id Interaction City State County Zip Lat Lng Population ... MonthlyCharge Bandwidth_GB_Year item1 item2 item3 item4 item5 item6 item7 item8
0 1 K409198 aa90260b-4141-4a24-8e36-b04ce1f4f77b Point Baker AK Prince of Wales-Hyder 99927 56.25100 -133.37571 38 ... 171.449762 904.536110 5 5 5 3 4 4 3 4
1 2 S120509 fb76459f-c047-4a9d-8af9-e0f7d4ac2524 West Branch MI Ogemaw 48661 44.32893 -84.24080 10446 ... 242.948015 800.982766 3 4 3 3 4 3 4 4
2 3 K191035 344d114c-3736-4be5-98f7-c72c281e2d35 Yamhill OR Yamhill 97148 45.35589 -123.24657 3735 ... 159.440398 2054.706961 4 4 2 4 4 3 3 3
3 4 D90850 abfa2b40-2d43-4994-b15a-989b8c79e311 Del Mar CA San Diego 92014 32.96687 -117.24798 13863 ... 120.249493 2164.579412 4 4 4 2 5 4 3 3
4 5 K662701 68a861fd-0d20-4e51-a587-8a90407ee574 Needville TX Fort Bend 77461 29.38012 -95.80673 11352 ... 150.761216 271.493436 4 4 4 3 4 4 4 5

5 rows × 51 columns

Use CaseOrder as index¶

The variable 'CaseOrder' can serve as the index since its values are sequential and unique.

In [7]:
churn.set_index('CaseOrder', inplace = True)
In [8]:
churn.head(5)
Out[8]:
Customer_id Interaction City State County Zip Lat Lng Population Area ... MonthlyCharge Bandwidth_GB_Year item1 item2 item3 item4 item5 item6 item7 item8
CaseOrder
1 K409198 aa90260b-4141-4a24-8e36-b04ce1f4f77b Point Baker AK Prince of Wales-Hyder 99927 56.25100 -133.37571 38 Urban ... 171.449762 904.536110 5 5 5 3 4 4 3 4
2 S120509 fb76459f-c047-4a9d-8af9-e0f7d4ac2524 West Branch MI Ogemaw 48661 44.32893 -84.24080 10446 Urban ... 242.948015 800.982766 3 4 3 3 4 3 4 4
3 K191035 344d114c-3736-4be5-98f7-c72c281e2d35 Yamhill OR Yamhill 97148 45.35589 -123.24657 3735 Urban ... 159.440398 2054.706961 4 4 2 4 4 3 3 3
4 D90850 abfa2b40-2d43-4994-b15a-989b8c79e311 Del Mar CA San Diego 92014 32.96687 -117.24798 13863 Suburban ... 120.249493 2164.579412 4 4 4 2 5 4 3 3
5 K662701 68a861fd-0d20-4e51-a587-8a90407ee574 Needville TX Fort Bend 77461 29.38012 -95.80673 11352 Suburban ... 150.761216 271.493436 4 4 4 3 4 4 4 5

5 rows × 50 columns

Renaming Ambiguous variables¶

In [9]:
churn.rename(columns={'item1':'timely_response','item2':'timely_fixes','item3':'timely_replacements','item4':'reliability','item5':'options',
                      'item6':'respectful_response','item7':'courteous_exchange','item8':'evidence_of_active_listening'}, inplace=True)
In [10]:
churn.columns
Out[10]:
Index(['Customer_id', 'Interaction', 'City', 'State', 'County', 'Zip', 'Lat',
       'Lng', 'Population', 'Area', 'Timezone', 'Job', 'Children', 'Age',
       'Education', 'Employment', 'Income', 'Marital', 'Gender', 'Churn',
       'Outage_sec_perweek', 'Email', 'Contacts', 'Yearly_equip_failure',
       'Techie', 'Contract', 'Port_modem', 'Tablet', 'InternetService',
       'Phone', 'Multiple', 'OnlineSecurity', 'OnlineBackup',
       'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
       'PaperlessBilling', 'PaymentMethod', 'Tenure', 'MonthlyCharge',
       'Bandwidth_GB_Year', 'timely_response', 'timely_fixes',
       'timely_replacements', 'reliability', 'options', 'respectful_response',
       'courteous_exchange', 'evidence_of_active_listening'],
      dtype='object')

Detecting Missing Values¶

In [11]:
pd.set_option('display.max_rows',500)
#(Pandas.set_option)
In [12]:
churn.isnull().sum()
Out[12]:
Customer_id                        0
Interaction                        0
City                               0
State                              0
County                             0
Zip                                0
Lat                                0
Lng                                0
Population                         0
Area                               0
Timezone                           0
Job                                0
Children                        2495
Age                             2475
Education                          0
Employment                         0
Income                          2490
Marital                            0
Gender                             0
Churn                              0
Outage_sec_perweek                 0
Email                              0
Contacts                           0
Yearly_equip_failure               0
Techie                          2477
Contract                           0
Port_modem                         0
Tablet                             0
InternetService                    0
Phone                           1026
Multiple                           0
OnlineSecurity                     0
OnlineBackup                       0
DeviceProtection                   0
TechSupport                      991
StreamingTV                        0
StreamingMovies                    0
PaperlessBilling                   0
PaymentMethod                      0
Tenure                           931
MonthlyCharge                      0
Bandwidth_GB_Year               1021
timely_response                    0
timely_fixes                       0
timely_replacements                0
reliability                        0
options                            0
respectful_response                0
courteous_exchange                 0
evidence_of_active_listening       0
dtype: int64
In [13]:
msno.matrix(churn, fontsize = 12, labels=True)
plt.title('Missing data matrix')
plt.show()

Techie, Phone, and TechSupport are yes/no questions that customers most likely skipped when they did not apply, so their missing values will be changed to 'No'. Children is similar: a skipped question most likely means the household has no children, so its missing values will be changed to 0.

Impute missing values¶

In [14]:
churn['Children'].fillna(0, inplace = True)
churn['Techie'].fillna('No', inplace = True)
churn['Phone'].fillna('No', inplace = True)
churn['TechSupport'].fillna('No', inplace = True)

Age

In [15]:
churn.hist('Age')
Out[15]:
array([[<AxesSubplot:title={'center':'Age'}>]], dtype=object)
In [16]:
imputer = IterativeImputer()
In [17]:
churn_iterative_age = churn.copy(deep=True)
In [18]:
churn_iterative_age['Age'] = imputer.fit_transform(churn_iterative_age[['Age']])
In [19]:
churn_iterative_age.hist('Age')
#(Kapoor, 2020)
Out[19]:
array([[<AxesSubplot:title={'center':'Age'}>]], dtype=object)
In [20]:
churn_KNN_age = churn.copy(deep=True)
In [21]:
# Impute Age using the two nearest neighbors
imputer_knn_age = KNNImputer(n_neighbors=2)
In [22]:
churn_KNN_age['Age'] = imputer_knn_age.fit_transform(churn_KNN_age[['Age']])
In [24]:
churn_KNN_age.hist('Age')
#(Kapoor, 2020)
Out[24]:
array([[<AxesSubplot:title={'center':'Age'}>]], dtype=object)
In [25]:
churn['Age'].fillna(churn['Age'].mean(), inplace=True)
In [26]:
churn.hist('Age')
Out[26]:
array([[<AxesSubplot:title={'center':'Age'}>]], dtype=object)

Income

In [27]:
churn.hist('Income')
Out[27]:
array([[<AxesSubplot:title={'center':'Income'}>]], dtype=object)

The median will be used since the Income distribution is right-skewed

In [28]:
churn['Income'].fillna(churn['Income'].median(), inplace=True)
In [29]:
churn.hist('Income')
Out[29]:
array([[<AxesSubplot:title={'center':'Income'}>]], dtype=object)

Tenure

In [30]:
churn.hist('Tenure')
Out[30]:
array([[<AxesSubplot:title={'center':'Tenure'}>]], dtype=object)

The median will be used for Tenure imputation since the distribution is bimodal.

In [31]:
churn['Tenure'].fillna(churn['Tenure'].median(), inplace=True)
In [32]:
churn.hist('Tenure')
Out[32]:
array([[<AxesSubplot:title={'center':'Tenure'}>]], dtype=object)

Bandwidth_GB_Year

In [33]:
churn.hist('Bandwidth_GB_Year')
Out[33]:
array([[<AxesSubplot:title={'center':'Bandwidth_GB_Year'}>]], dtype=object)

The median will be used for Bandwidth_GB_Year imputation since the distribution is bimodal.

In [34]:
churn['Bandwidth_GB_Year'].fillna(churn['Bandwidth_GB_Year'].median(), inplace=True)
In [35]:
churn.hist('Bandwidth_GB_Year')
Out[35]:
array([[<AxesSubplot:title={'center':'Bandwidth_GB_Year'}>]], dtype=object)
In [36]:
churn.isna().sum()
Out[36]:
Customer_id                     0
Interaction                     0
City                            0
State                           0
County                          0
Zip                             0
Lat                             0
Lng                             0
Population                      0
Area                            0
Timezone                        0
Job                             0
Children                        0
Age                             0
Education                       0
Employment                      0
Income                          0
Marital                         0
Gender                          0
Churn                           0
Outage_sec_perweek              0
Email                           0
Contacts                        0
Yearly_equip_failure            0
Techie                          0
Contract                        0
Port_modem                      0
Tablet                          0
InternetService                 0
Phone                           0
Multiple                        0
OnlineSecurity                  0
OnlineBackup                    0
DeviceProtection                0
TechSupport                     0
StreamingTV                     0
StreamingMovies                 0
PaperlessBilling                0
PaymentMethod                   0
Tenure                          0
MonthlyCharge                   0
Bandwidth_GB_Year               0
timely_response                 0
timely_fixes                    0
timely_replacements             0
reliability                     0
options                         0
respectful_response             0
courteous_exchange              0
evidence_of_active_listening    0
dtype: int64

Re-express Categorical Variables¶

Timezone

In [37]:
churn.Timezone.unique()
Out[37]:
array(['America/Sitka', 'America/Detroit', 'America/Los_Angeles',
       'America/Chicago', 'America/New_York', 'America/Puerto_Rico',
       'America/Denver', 'America/Menominee', 'America/Phoenix',
       'America/Indiana/Indianapolis', 'America/Boise',
       'America/Kentucky/Louisville', 'Pacific/Honolulu',
       'America/Indiana/Petersburg', 'America/Nome', 'America/Anchorage',
       'America/Indiana/Knox', 'America/Juneau', 'America/Toronto',
       'America/Indiana/Winamac', 'America/Indiana/Vincennes',
       'America/North_Dakota/New_Salem', 'America/Indiana/Tell_City',
       'America/Indiana/Marengo', 'America/Ojinaga'], dtype=object)
In [38]:
churn['Timezone_numeric'] = churn['Timezone']
In [39]:
dict_timezone = {'Timezone_numeric':{'America/Sitka':-9, 'America/Detroit':-5, 'America/Los_Angeles':-8,
       'America/Chicago':-6, 'America/New_York':-5, 'America/Puerto_Rico':-4,
       'America/Denver':-7, 'America/Menominee':-6, 'America/Phoenix':-7,
       'America/Indiana/Indianapolis':-5, 'America/Boise':-7,
       'America/Kentucky/Louisville':-5, 'Pacific/Honolulu':-10,
       'America/Indiana/Petersburg':-5, 'America/Nome':-9, 'America/Anchorage':-9,
       'America/Indiana/Knox':-6, 'America/Juneau':-9, 'America/Toronto':-5,
       'America/Indiana/Winamac':-5, 'America/Indiana/Vincennes':-5,
       'America/North_Dakota/New_Salem':-6, 'America/Indiana/Tell_City':-6,
       'America/Indiana/Marengo':-5, 'America/Ojinaga':-7}}

("Time zone converter: Calculate time difference between time zones in the world," n.d.)

In [40]:
churn.replace(dict_timezone, inplace=True)
In [41]:
churn.Timezone_numeric.unique()
Out[41]:
array([ -9,  -5,  -8,  -6,  -4,  -7, -10], dtype=int64)

Education

In [42]:
churn.Education.unique()
Out[42]:
array(["Master's Degree", 'Regular High School Diploma',
       'Doctorate Degree', 'No Schooling Completed', "Associate's Degree",
       "Bachelor's Degree", 'Some College, Less than 1 Year',
       'GED or Alternative Credential',
       'Some College, 1 or More Years, No Degree',
       '9th Grade to 12th Grade, No Diploma',
       'Nursery School to 8th Grade', 'Professional School Degree'],
      dtype=object)
In [43]:
churn['Education_numeric'] = churn['Education']
In [44]:
dict_Education = {'Education_numeric':{
    'No Schooling Completed':0,
    'Nursery School to 8th Grade':8,
    '9th Grade to 12th Grade, No Diploma':11,
    'Regular High School Diploma':12,
    'GED or Alternative Credential':12,
    'Some College, Less than 1 Year':13,
    'Some College, 1 or More Years, No Degree':14,
    "Associate's Degree":15,
    "Bachelor's Degree":16, 
    "Master's Degree":18, 
    'Professional School Degree':20,
    'Doctorate Degree':24
}}
In [45]:
churn.replace(dict_Education, inplace=True)
In [46]:
churn.Education_numeric.unique()
Out[46]:
array([18, 12, 24,  0, 15, 16, 13, 14, 11,  8, 20], dtype=int64)

Churn

In [47]:
churn.Churn.unique()
Out[47]:
array(['No', 'Yes'], dtype=object)
In [48]:
churn['churn_numeric'] = churn['Churn']
In [49]:
dict_churn = {'churn_numeric':{'No':0,'Yes':1}}
In [50]:
churn.replace(dict_churn, inplace=True)
In [51]:
churn.churn_numeric.unique()
Out[51]:
array([0, 1], dtype=int64)

Techie

In [52]:
churn.Techie.unique()
Out[52]:
array(['No', 'Yes'], dtype=object)
In [53]:
churn['techie_numeric'] = churn['Techie']
In [54]:
dict_techie = {'techie_numeric':{'No':0,'Yes':1}}
In [55]:
churn.replace(dict_techie,inplace=True)
In [56]:
churn.techie_numeric.unique()
Out[56]:
array([0, 1], dtype=int64)

Port_Modem

In [57]:
churn.Port_modem.unique()
Out[57]:
array(['Yes', 'No'], dtype=object)
In [58]:
churn['port_modem_numeric'] = churn['Port_modem']
In [59]:
dict_port = {'port_modem_numeric':{'No':0,'Yes':1}}
In [60]:
churn.replace(dict_port,inplace=True)
In [61]:
churn.port_modem_numeric.unique()
Out[61]:
array([1, 0], dtype=int64)

Tablet

In [62]:
churn.Tablet.unique()
Out[62]:
array(['Yes', 'No'], dtype=object)
In [63]:
churn['tablet_numeric'] = churn['Tablet']
In [64]:
dict_tablet = {'tablet_numeric':{'Yes':1,'No':0}}
In [65]:
churn.replace(dict_tablet, inplace = True)
In [66]:
churn.tablet_numeric.unique()
Out[66]:
array([1, 0], dtype=int64)

Phone

In [67]:
churn.Phone.unique()
Out[67]:
array(['Yes', 'No'], dtype=object)
In [68]:
churn['phone_numeric'] = churn['Phone']
In [69]:
dict_phone = {'phone_numeric':{'Yes':1,'No':0}}
In [70]:
churn.replace(dict_phone, inplace = True)
In [71]:
churn.phone_numeric.unique()
Out[71]:
array([1, 0], dtype=int64)

Multiple

In [72]:
churn.Multiple.unique()
Out[72]:
array(['No', 'Yes'], dtype=object)
In [73]:
churn['multiple_numeric'] = churn['Multiple']
In [74]:
dict_multiple = {'multiple_numeric':{'Yes':1,'No':0}}
In [75]:
churn.replace(dict_multiple, inplace = True)
In [76]:
churn.multiple_numeric.unique()
Out[76]:
array([0, 1], dtype=int64)

OnlineSecurity

In [77]:
churn.OnlineSecurity.unique()
Out[77]:
array(['Yes', 'No'], dtype=object)
In [78]:
churn['online_security_numeric'] = churn['OnlineSecurity']
In [79]:
dict_security = {'online_security_numeric':{'Yes':1,'No':0}}
In [80]:
churn.replace(dict_security, inplace = True)
In [81]:
churn.online_security_numeric.unique()
Out[81]:
array([1, 0], dtype=int64)

OnlineBackup

In [82]:
churn.OnlineBackup.unique()
Out[82]:
array(['Yes', 'No'], dtype=object)
In [83]:
churn['online_backup_numeric'] = churn['OnlineBackup']
In [84]:
dict_backup = {'online_backup_numeric':{'Yes':1,'No':0}}
In [85]:
churn.replace(dict_backup, inplace = True)
In [86]:
churn.online_backup_numeric.unique()
Out[86]:
array([1, 0], dtype=int64)

DeviceProtection

In [87]:
churn.DeviceProtection.unique()
Out[87]:
array(['No', 'Yes'], dtype=object)
In [88]:
churn['device_protection_numeric'] = churn['DeviceProtection']
In [89]:
dict_protection = {'device_protection_numeric':{'Yes':1,'No':0}}
In [90]:
churn.replace(dict_protection, inplace = True)
In [91]:
churn.device_protection_numeric.unique()
Out[91]:
array([0, 1], dtype=int64)

TechSupport

In [92]:
churn.TechSupport.unique()
Out[92]:
array(['No', 'Yes'], dtype=object)
In [93]:
churn['tech_support_numeric'] = churn['TechSupport']
In [94]:
dict_support = {'tech_support_numeric':{'No':0,'Yes':1}}
In [95]:
churn.replace(dict_support, inplace = True)
In [96]:
churn.tech_support_numeric.unique()
Out[96]:
array([0, 1], dtype=int64)

StreamingTV

In [97]:
churn.StreamingTV.unique()
Out[97]:
array(['No', 'Yes'], dtype=object)
In [98]:
churn['streaming_tv_numeric'] = churn['StreamingTV']
In [99]:
dict_tv = {'streaming_tv_numeric':{'No':0,'Yes':1}}
In [100]:
churn.replace(dict_tv, inplace = True)
In [101]:
churn.streaming_tv_numeric.unique()
Out[101]:
array([0, 1], dtype=int64)

StreamingMovies

In [102]:
churn.StreamingMovies.unique()
Out[102]:
array(['Yes', 'No'], dtype=object)
In [103]:
churn['streaming_movies_numeric'] = churn['StreamingMovies']
In [104]:
dict_movies = {'streaming_movies_numeric':{'Yes':1,'No':0}}
In [105]:
churn.replace(dict_movies, inplace = True)
In [106]:
churn.streaming_movies_numeric.unique()
Out[106]:
array([1, 0], dtype=int64)

PaperlessBilling

In [107]:
churn.PaperlessBilling.unique()
Out[107]:
array(['Yes', 'No'], dtype=object)
In [108]:
churn['paperless_billing_numeric'] = churn['PaperlessBilling']
In [109]:
dict_billing = {'paperless_billing_numeric':{'Yes':1,'No':0}}
In [110]:
churn.replace(dict_billing, inplace = True)
In [111]:
churn.paperless_billing_numeric.unique()
Out[111]:
array([1, 0], dtype=int64)
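The thirteen yes/no re-expressions above all follow the same four-step pattern. As a design note, they could be consolidated into a single loop; this is a minimal sketch, and the generated column names (e.g., 'onlinesecurity_numeric') differ slightly from the ones created above.

yes_no_columns = ['Churn', 'Techie', 'Port_modem', 'Tablet', 'Phone', 'Multiple',
                  'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                  'StreamingTV', 'StreamingMovies', 'PaperlessBilling']
for col in yes_no_columns:
    # map Yes/No to 1/0 in a new numeric column
    churn[col.lower() + '_numeric'] = churn[col].map({'Yes': 1, 'No': 0})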

Identifying and Treating Outliers¶

Lat

In [112]:
churn['lat_z'] = stats.zscore(churn['Lat'])
In [113]:
churn.hist('lat_z')
Out[113]:
array([[<AxesSubplot:title={'center':'lat_z'}>]], dtype=object)
In [114]:
lat_outliers = churn.query('lat_z > 3 | lat_z < -3')
In [115]:
lat_outliers.lat_z.count()
Out[115]:
151
In [116]:
lat_outliers.sort_values(['lat_z'], ascending=False).head()
Out[116]:
Customer_id Interaction City State County Zip Lat Lng Population Area ... phone_numeric multiple_numeric online_security_numeric online_backup_numeric device_protection_numeric tech_support_numeric streaming_tv_numeric streaming_movies_numeric paperless_billing_numeric lat_z
CaseOrder
2624 K111418 6b6ae769-af9e-48df-bb2a-8f01dfc5bebc Wainwright AK North Slope 99782 70.64066 -159.93042 508 Suburban ... 1 1 0 0 1 1 1 0 1 5.863971
520 K296336 fe640326-8647-468b-ba07-b4b1ef988c67 Wainwright AK North Slope 99782 70.64066 -159.93042 508 Rural ... 1 0 0 0 1 0 1 0 1 5.863971
5957 I257218 1cb1ff82-8bf8-47da-ba60-bfdc59e9f86e Prudhoe Bay AK North Slope 99734 70.36853 -148.99200 1458 Rural ... 0 1 0 0 1 1 0 0 0 5.813920
8296 D683409 ca28d5ea-ff21-4b2c-a4b2-1e1df7cccda7 Anchorage AK North Slope 99510 70.13850 -149.95980 949 Urban ... 1 1 0 0 1 1 1 0 0 5.771613
572 D878040 3376a1d6-bede-45c0-84cf-1e059b03cb50 Anchorage AK North Slope 99510 70.13850 -149.95982 949 Suburban ... 1 0 0 0 0 0 1 0 0 5.771613

5 rows × 66 columns

The outliers will be retained; they come from locations far from the continental United States.

Lng

In [117]:
churn['lng_z'] = stats.zscore(churn['Lng'])
In [118]:
churn.hist('lng_z')
Out[118]:
array([[<AxesSubplot:title={'center':'lng_z'}>]], dtype=object)
In [119]:
lng_outliers=churn.query('lng_z > 3 | lng_z < -3')
In [120]:
lng_outliers.lng_z.count()
Out[120]:
102
In [121]:
lng_outliers.sort_values(['lng_z'], ascending=False).head()
Out[121]:
Customer_id Interaction City State County Zip Lat Lng Population Area ... multiple_numeric online_security_numeric online_backup_numeric device_protection_numeric tech_support_numeric streaming_tv_numeric streaming_movies_numeric paperless_billing_numeric lat_z lng_z
CaseOrder
5408 D120097 4fde3ffc-1d9e-4087-a1cf-e257e73b0034 Tok AK Southeast Fairbanks 99780 63.19467 -143.07800 1559 Urban ... 1 0 0 0 1 1 1 1 4.494497 -3.450620
1206 B756017 3932e10c-26a1-4291-8344-adef110b9350 Fort Yukon AK Yukon-Koyukuk 99740 67.53438 -143.74520 533 Urban ... 1 0 0 1 0 1 1 1 5.292660 -3.494644
9874 J547555 0b892c49-09c3-4da5-b3a6-40f9c5844ba4 Circle AK Yukon-Koyukuk 99733 65.79284 -144.18280 74 Suburban ... 0 0 0 0 0 0 0 1 4.972355 -3.523518
4541 V157744 c10943a0-85dd-4408-bb6e-a90745a4810e Copper Center AK Valdez-Cordova 99573 61.62257 -145.99910 540 Suburban ... 0 0 0 0 0 0 0 1 4.205354 -3.643363
2926 E261748 2eec95d1-0ffb-4d09-83a6-c60681cdfe31 Venetie AK Yukon-Koyukuk 99781 67.47706 -146.01946 141 Urban ... 1 0 0 1 1 0 1 1 5.282118 -3.644706

5 rows × 67 columns

The outliers will be retained; they come from locations far from the continental United States.

Population

In [122]:
churn['population_z'] = stats.zscore(churn['Population'])
In [123]:
churn.hist('population_z')
Out[123]:
array([[<AxesSubplot:title={'center':'population_z'}>]], dtype=object)
In [124]:
boxplot=sns.boxplot(x='population_z', data=churn)
In [125]:
pop_outliers=churn.query('population_z > 3 | population_z < -3')
In [126]:
pop_outliers.population_z.count()
Out[126]:
219
In [127]:
pop_outliers.sort_values(['population_z'], ascending=False).head()
Out[127]:
Customer_id Interaction City State County Zip Lat Lng Population Area ... online_security_numeric online_backup_numeric device_protection_numeric tech_support_numeric streaming_tv_numeric streaming_movies_numeric paperless_billing_numeric lat_z lng_z population_z
CaseOrder
8140 G336446 b01df5d1-0236-4c13-ac2d-dd71c34f50f8 Chicago IL Cook 60629 41.77567 -87.71176 111850 Suburban ... 0 1 0 0 1 1 1 0.555093 0.202619 7.074113
8321 Y465694 ec724419-c0d1-41d5-8738-ef1c615bfa0c Bronx NY Bronx 10467 40.88085 -73.87394 103732 Urban ... 1 1 1 0 1 1 0 0.390516 1.115682 6.511612
6289 A40824 6303b691-2f39-4b57-8a8c-df0e076a96a6 Bell Gardens CA Los Angeles 90201 33.97074 -118.17080 102433 Rural ... 1 0 1 1 0 0 0 -0.880398 -1.807164 6.421604
1776 C544905 5c202e0c-6416-483a-9295-5d96d87a6983 Brooklyn NY Kings 11208 40.66853 -73.87089 98660 Suburban ... 1 0 0 0 1 1 1 0.351466 1.115884 6.160170
6611 Y79840 6feec1b5-b842-4596-97ba-6fe5ad3444d9 Fontana CA San Bernardino 92336 34.14649 -117.46390 96575 Suburban ... 0 1 1 1 1 1 1 -0.848074 -1.760521 6.015700

5 rows × 68 columns

Large populations are normal for big cities, so the data will be left as is.

Children

In [128]:
churn['children_z'] = stats.zscore(churn['Children'])
In [129]:
churn.hist('children_z')
Out[129]:
array([[<AxesSubplot:title={'center':'children_z'}>]], dtype=object)
In [130]:
boxplot=sns.boxplot(x='children_z', data=churn)
In [131]:
children_outliers = churn.query('children_z > 3 | children_z < -3')
In [132]:
children_outliers.children_z.count()
Out[132]:
302
In [133]:
children_outliers[['Age','Children','children_z']].sort_values(['Children'], ascending=False).head(10)
Out[133]:
Age Children children_z
CaseOrder
4744 40.000000 10.0 4.060712
5779 53.275748 10.0 4.060712
2180 44.000000 10.0 4.060712
5978 29.000000 10.0 4.060712
6195 36.000000 10.0 4.060712
6455 86.000000 10.0 4.060712
6702 47.000000 10.0 4.060712
3654 23.000000 10.0 4.060712
7128 53.275748 10.0 4.060712
1634 53.275748 10.0 4.060712
In [134]:
churn.hist('Children')
Out[134]:
array([[<AxesSubplot:title={'center':'Children'}>]], dtype=object)

Children outliers will be imputed using the median because the data shows people in their 20s with 8 or more children, which is highly unlikely.

In [135]:
children_median=churn['Children'].median()
In [136]:
mask = churn['Customer_id'].isin(children_outliers['Customer_id'])
In [137]:
churn.loc[mask, 'Children'] = children_median
In [138]:
churn.hist('Children', bins=7)
Out[138]:
array([[<AxesSubplot:title={'center':'Children'}>]], dtype=object)
In [139]:
churn[['Age','Children','children_z']].sort_values(['Children'], ascending=False).head(10)
Out[139]:
Age Children children_z
CaseOrder
6674 19.000000 7.0 2.615104
7966 89.000000 7.0 2.615104
9012 27.000000 7.0 2.615104
6889 62.000000 7.0 2.615104
6892 53.275748 7.0 2.615104
9844 33.000000 7.0 2.615104
4095 29.000000 7.0 2.615104
5177 53.275748 7.0 2.615104
5786 89.000000 7.0 2.615104
1869 52.000000 7.0 2.615104

Age

In [140]:
churn['age_z'] = stats.zscore(churn['Age'])
In [141]:
churn.hist('age_z')
Out[141]:
array([[<AxesSubplot:title={'center':'age_z'}>]], dtype=object)
In [142]:
boxplot=sns.boxplot(x='age_z', data=churn)

There are no outliers in age.

Income

In [143]:
churn['income_z'] = stats.zscore(churn['Income'])
In [144]:
churn.hist('income_z')
Out[144]:
array([[<AxesSubplot:title={'center':'income_z'}>]], dtype=object)
In [145]:
boxplot=sns.boxplot(x='income_z', data=churn)
In [146]:
income_outliers = churn.query('income_z > 3 | income_z < -3')
In [147]:
income_outliers.income_z.count()
Out[147]:
193
In [148]:
churn[['Age','Income','income_z']].sort_values(['income_z'],ascending=False)
Out[148]:
Age Income income_z
CaseOrder
4250 56.000000 258900.70 8.916149
9181 69.000000 256998.40 8.839278
5802 53.275748 212255.30 7.031230
6838 35.000000 194550.70 6.315795
3986 76.000000 189938.40 6.129414
... ... ... ...
4734 53.275748 951.96 -1.507440
5462 62.000000 945.47 -1.507702
4170 78.000000 938.81 -1.507971
2526 22.000000 901.21 -1.509490
1897 53.275748 740.66 -1.515978

10000 rows × 3 columns

Right skewness is normal for income, so the outliers will be retained.

Outage sec perweek

In [149]:
churn['outage_sec_perweek_z'] = stats.zscore(churn['Outage_sec_perweek'])
In [150]:
churn.hist('outage_sec_perweek_z')
Out[150]:
array([[<AxesSubplot:title={'center':'outage_sec_perweek_z'}>]],
      dtype=object)
In [151]:
boxplot = sns.boxplot(x='outage_sec_perweek_z', data=churn)
In [152]:
outage_outliers = churn.query('outage_sec_perweek_z > 3 | outage_sec_perweek_z < -3')
In [153]:
outage_outliers.outage_sec_perweek_z.count()
Out[153]:
491
In [154]:
churn[['Outage_sec_perweek','outage_sec_perweek_z']].sort_values(['outage_sec_perweek_z'], ascending=False).tail(20)
Out[154]:
Outage_sec_perweek outage_sec_perweek_z
CaseOrder
7390 0.994552 -1.488620
8181 0.915846 -1.499823
8192 0.852520 -1.508836
2985 0.840953 -1.510483
9403 0.683623 -1.532877
7071 0.359073 -1.579072
4698 0.278712 -1.590511
909 0.169351 -1.606077
7340 0.113821 -1.613981
6464 -0.144644 -1.650770
3630 -0.152845 -1.651937
3070 -0.206145 -1.659524
8195 -0.214328 -1.660689
1998 -0.339214 -1.678464
4185 -0.352431 -1.680346
6578 -0.527396 -1.705250
6094 -0.787115 -1.742217
4428 -1.099934 -1.786743
1905 -1.195428 -1.800336
4168 -1.348571 -1.822134

Outliers on the upper end of outage time are normal and will be kept; however, some outage durations are negative, which is impossible. Since there are so few of those, we will simply drop those rows.

In [155]:
churn.drop(churn[churn['Outage_sec_perweek'] < 0].index, inplace = True)
In [156]:
churn[['Age','Bandwidth_GB_Year','Outage_sec_perweek','outage_sec_perweek_z']].sort_values(['outage_sec_perweek_z'], ascending=False).tail(10)
Out[156]:
Age Bandwidth_GB_Year Outage_sec_perweek outage_sec_perweek_z
CaseOrder
8775 33.000000 6089.067000 1.049154 -1.480848
7390 83.000000 6450.351000 0.994552 -1.488620
8181 70.000000 5588.562000 0.915846 -1.499823
8192 84.000000 4771.553000 0.852520 -1.508836
2985 70.000000 1911.199623 0.840953 -1.510483
9403 53.275748 5441.578000 0.683623 -1.532877
7071 53.275748 6447.107000 0.359073 -1.579072
4698 53.275748 654.458600 0.278712 -1.590511
909 75.000000 3382.424000 0.169351 -1.606077
7340 53.275748 4790.241000 0.113821 -1.613981

Email

In [157]:
churn['email_z'] = stats.zscore(churn['Email'])
In [158]:
churn.hist('email_z')
Out[158]:
array([[<AxesSubplot:title={'center':'email_z'}>]], dtype=object)
In [159]:
boxplot = sns.boxplot(x='email_z', data=churn)
In [160]:
email_outliers = churn.query('email_z > 3 | email_z < -3')
In [161]:
email_outliers.email_z.count()
Out[161]:
12
In [162]:
churn.hist('Email')
Out[162]:
array([[<AxesSubplot:title={'center':'Email'}>]], dtype=object)

There are outliers, but the distribution is normal and nothing is significantly incorrect, so the data will be left as is.

Contacts

In [163]:
churn['contacts_z'] = stats.zscore(churn['Contacts'])
In [164]:
churn.hist('contacts_z')
Out[164]:
array([[<AxesSubplot:title={'center':'contacts_z'}>]], dtype=object)
In [165]:
contacts_outliers = churn.query('contacts_z > 3 | contacts_z < -3')
In [166]:
contacts_outliers[['Contacts','contacts_z']].sort_values(['contacts_z'], ascending=True).head()
Out[166]:
Contacts contacts_z
CaseOrder
89 4 3.040865
6357 4 3.040865
6390 4 3.040865
6429 4 3.040865
6454 4 3.040865
In [167]:
contacts_outliers.contacts_z.count()
Out[167]:
165
In [168]:
churn['Contacts'].min()
Out[168]:
0
In [169]:
churn['Contacts'].max()
Out[169]:
7

There are outliers, but these values would not be out of the ordinary for the number of times a customer contacted support, so the data will be left as is.

Yearly equip failure

In [170]:
churn['yearly_equip_failure_z'] = stats.zscore(churn['Yearly_equip_failure'])
In [171]:
pd.unique(churn['yearly_equip_failure_z'])
Out[171]:
array([ 0.94619579, -0.62591827,  4.09042393,  2.51830986,  5.662538  ,
        8.80676613])
In [172]:
churn.hist('yearly_equip_failure_z',bins=6)
Out[172]:
array([[<AxesSubplot:title={'center':'yearly_equip_failure_z'}>]],
      dtype=object)
In [173]:
yearly_equip_failure_outliers = churn.query('yearly_equip_failure_z > 3 | yearly_equip_failure_z < -3')
In [174]:
yearly_equip_failure_outliers[['Yearly_equip_failure', 'yearly_equip_failure_z']].sort_values(['yearly_equip_failure_z'], ascending=True)
Out[174]:
Yearly_equip_failure yearly_equip_failure_z
CaseOrder
9 3 4.090424
7112 3 4.090424
7019 3 4.090424
6948 3 4.090424
6587 3 4.090424
6532 3 4.090424
6441 3 4.090424
6367 3 4.090424
6307 3 4.090424
7186 3 4.090424
6197 3 4.090424
6132 3 4.090424
6099 3 4.090424
6051 3 4.090424
5892 3 4.090424
5769 3 4.090424
5575 3 4.090424
5224 3 4.090424
9770 3 4.090424
6144 3 4.090424
7223 3 4.090424
7279 3 4.090424
7332 3 4.090424
9675 3 4.090424
9584 3 4.090424
9423 3 4.090424
9342 3 4.090424
9176 3 4.090424
9109 3 4.090424
8992 3 4.090424
8981 3 4.090424
8492 3 4.090424
8314 3 4.090424
8282 3 4.090424
8063 3 4.090424
8055 3 4.090424
7891 3 4.090424
7646 3 4.090424
7575 3 4.090424
7455 3 4.090424
7351 3 4.090424
7348 3 4.090424
5157 3 4.090424
5076 3 4.090424
9968 3 4.090424
4696 3 4.090424
5058 3 4.090424
2274 3 4.090424
1940 3 4.090424
1842 3 4.090424
1635 3 4.090424
1592 3 4.090424
1478 3 4.090424
1379 3 4.090424
2375 3 4.090424
1307 3 4.090424
1109 3 4.090424
858 3 4.090424
711 3 4.090424
698 3 4.090424
622 3 4.090424
593 3 4.090424
172 3 4.090424
21 3 4.090424
1190 3 4.090424
2663 3 4.090424
2336 3 4.090424
2939 3 4.090424
4473 3 4.090424
4384 3 4.090424
4350 3 4.090424
4343 3 4.090424
4269 3 4.090424
2819 3 4.090424
3996 3 4.090424
4687 3 4.090424
3937 3 4.090424
3602 3 4.090424
3552 3 4.090424
3451 3 4.090424
3325 3 4.090424
3270 3 4.090424
3238 3 4.090424
3023 3 4.090424
3900 3 4.090424
4587 3 4.090424
9624 4 5.662538
9764 4 5.662538
1117 4 5.662538
9387 4 5.662538
1229 4 5.662538
6346 4 5.662538
5167 4 5.662538
5472 6 8.806766
In [175]:
yearly_equip_failure_outliers.yearly_equip_failure_z.count()
Out[175]:
94
In [176]:
churn.hist('Yearly_equip_failure',bins=6)
Out[176]:
array([[<AxesSubplot:title={'center':'Yearly_equip_failure'}>]],
      dtype=object)

There are outliers, but these values would not be out of the ordinary for the number of equipment failures, so the data will be left as is.

Tenure

In [177]:
churn['tenure_z'] = stats.zscore(churn['Tenure'])
In [178]:
churn[['Tenure','tenure_z']].sort_values(['tenure_z'], ascending=False)
Out[178]:
Tenure tenure_z
CaseOrder
9483 71.999280 1.482534
8493 71.994180 1.482331
8732 71.993730 1.482313
8480 71.991320 1.482218
7772 71.988280 1.482097
... ... ...
2538 1.025579 -1.335826
1677 1.019306 -1.336075
1064 1.018520 -1.336106
3888 1.005104 -1.336639
1952 1.000259 -1.336831

9989 rows × 2 columns

In [179]:
churn.hist('tenure_z')
Out[179]:
array([[<AxesSubplot:title={'center':'tenure_z'}>]], dtype=object)

There are no outliers in this variable.

MonthlyCharge

In [180]:
churn['monthly_charge_z'] = stats.zscore(churn['MonthlyCharge'])
In [181]:
churn.hist('monthly_charge_z')
Out[181]:
array([[<AxesSubplot:title={'center':'monthly_charge_z'}>]], dtype=object)
In [182]:
churn[['Income','Port_modem','InternetService','Phone','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
       'Bandwidth_GB_Year','MonthlyCharge','monthly_charge_z']].sort_values(['monthly_charge_z'], ascending=False)
Out[182]:
Income Port_modem InternetService Phone OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Bandwidth_GB_Year MonthlyCharge monthly_charge_z
CaseOrder
3747 33186.785 No Fiber Optic No No Yes Yes Yes Yes Yes 1288.595000 315.878600 3.272810
928 7598.570 No Fiber Optic Yes Yes Yes Yes No Yes Yes 1482.558282 307.528124 3.080080
4701 52866.310 Yes Fiber Optic Yes Yes Yes Yes No Yes Yes 1383.764000 306.268000 3.050996
799 4297.280 No DSL Yes Yes Yes Yes Yes Yes Yes 2312.167935 299.206164 2.888008
1431 29315.240 Yes DSL Yes Yes Yes Yes Yes Yes Yes 1998.877156 298.173023 2.864163
... ... ... ... ... ... ... ... ... ... ... ... ... ...
5948 8324.790 Yes None Yes No No No No No No 4411.075000 78.314360 -2.210210
4327 23513.120 No None Yes No No No No No No 638.163700 78.264670 -2.211357
5423 11609.430 Yes None Yes No No No No No No 4345.907000 78.052880 -2.216245
4603 33186.785 Yes None Yes No No No No No No 1346.570000 77.924520 -2.219208
3577 56246.750 Yes None Yes No No No No No No 1333.702000 77.505230 -2.228885

9989 rows × 13 columns

There are only three outliers, and the number of services these customers subscribe to justifies the monthly charge, which is also not much higher than the values just below the cutoff.

Bandwidth_GB_Year

In [183]:
churn['bandwidth_gb_year_z'] = stats.zscore(churn['Bandwidth_GB_Year'])
In [184]:
churn.hist('bandwidth_gb_year_z')
Out[184]:
array([[<AxesSubplot:title={'center':'bandwidth_gb_year_z'}>]],
      dtype=object)
In [185]:
churn[['Bandwidth_GB_Year','bandwidth_gb_year_z']].sort_values(['bandwidth_gb_year_z'], ascending=False)
Out[185]:
Bandwidth_GB_Year bandwidth_gb_year_z
CaseOrder
9947 7158.982000 1.814641
6991 7138.309000 1.804668
9043 7096.495000 1.784495
8894 7084.803000 1.778854
8807 7081.307000 1.777168
... ... ...
433 243.261776 -1.521791
1768 242.994755 -1.521920
2760 223.476583 -1.531336
400 169.399280 -1.557425
2710 155.506715 -1.564128

9989 rows × 2 columns

There are no outliers.

Extract clean data¶

In [186]:
churn.to_csv(r'C:/Users/GrndS/WGU_MSDA/D206/clean_churn.csv')

Principal Component Analysis¶

Import cleaned data

In [187]:
clean_churn = pd.read_csv('C:/Users/GrndS/WGU_MSDA/D206/clean_churn.csv')

Specifying the variables to be used to perform PCA.

In [188]:
churn_pca = clean_churn[['Population','Children','Age','Income','Outage_sec_perweek','Email','Contacts','Yearly_equip_failure','Tenure','MonthlyCharge','Bandwidth_GB_Year']]

Normalize data and apply PCA

In [189]:
churn_pca_normalized = (churn_pca - churn_pca.mean())/churn_pca.std()
In [190]:
pca = PCA(n_components = churn_pca.shape[1])
In [191]:
pca.fit(churn_pca_normalized)
Out[191]:
PCA(n_components=11)
In [192]:
churn_pca2 = pd.DataFrame(pca.transform(churn_pca_normalized), columns = ['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','PC11'])
In [193]:
loadings = pd.DataFrame(pca.components_.T,
                        columns=['PC1','PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10','PC11'],
                        index=churn_pca_normalized.columns)
loadings
Out[193]:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11
Population 0.000018 -0.062709 -0.451202 -0.048610 -0.058525 0.454502 0.571390 0.430314 -0.261657 0.001489 0.000813
Children -0.011579 0.060135 0.467966 0.080407 -0.112091 -0.426230 0.535013 0.430112 0.324149 0.006936 0.014138
Age -0.013055 -0.040140 0.031768 0.750710 -0.185684 -0.105312 0.282311 -0.399292 -0.365571 0.121717 -0.022007
Income 0.006340 -0.003116 0.232227 0.044735 0.780565 0.310045 0.330778 -0.299581 0.185186 -0.071780 -0.000917
Outage_sec_perweek 0.021872 0.704870 -0.019081 -0.040700 0.030825 0.093441 -0.000259 -0.001251 0.028312 0.700125 -0.000459
Email -0.021558 0.046083 -0.577669 -0.052690 -0.156575 -0.206492 0.267715 -0.398428 0.600262 -0.054966 -0.005259
Contacts 0.005041 -0.009947 -0.233205 0.639115 0.222944 0.105428 -0.354898 0.450085 0.384269 0.001903 0.002835
Yearly_equip_failure 0.015284 0.058707 0.361718 0.078895 -0.514320 0.661439 -0.016972 -0.135752 0.350474 -0.125191 0.002645
Tenure 0.704903 -0.057819 -0.011001 0.008112 -0.007013 -0.013437 0.011535 -0.019566 0.005019 0.038692 0.705205
MonthlyCharge 0.045496 0.696351 -0.055401 0.079927 0.022286 -0.063463 0.002997 0.021232 -0.163708 -0.685176 0.048144
Bandwidth_GB_Year 0.706749 -0.008492 -0.002469 -0.005065 -0.002905 -0.018481 0.010373 0.008106 0.009550 -0.012128 -0.706852

Create Scree Plot

In [194]:
cov_matrix = np.dot(churn_pca_normalized.T, churn_pca_normalized) / churn_pca.shape[0]
In [195]:
eigenvalues = [np.dot(eigenvector.T, np.dot(cov_matrix, eigenvector)) for eigenvector in pca.components_]
In [196]:
plt.plot(eigenvalues)
plt.xlabel('number of components')
plt.ylabel('eigenvalues')
plt.axhline(y=1, color='red')
plt.show()
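As a cross-check, the fitted PCA object exposes these same eigenvalues directly; the manual covariance computation above should agree with pca.explained_variance_ up to the n versus n-1 normalization. A minimal sketch:

# explained_variance_ divides by n-1; the manual version above divides by n
n = len(churn_pca_normalized)
print(pca.explained_variance_ * (n - 1) / n)  # should approximately match eigenvalues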

Select Principal Components

In [197]:
eigenvalues
Out[197]:
[1.8968692038225257,
 1.1323992232844229,
 1.0429187918179732,
 1.0175290037575966,
 1.0038988929742423,
 0.9969099412607694,
 0.9914462665331434,
 0.9786805876690792,
 0.9702988764979108,
 0.864725980959536,
 0.1032220200902799]
In [198]:
pca.explained_variance_ratio_.cumsum()
Out[198]:
array([0.17245992, 0.27541561, 0.3702359 , 0.4627478 , 0.55402047,
       0.64465772, 0.73479823, 0.8237781 , 0.91199592, 0.99061524,
       1.        ])

About 55% of the variance is explained by the first five components. We will retain the first five components using the Kaiser rule (eigenvalues greater than 1).

Part III: Data Cleaning (Treatment)¶

D1. Cleaning Findings¶

There was a large amount of "dirty" data as well as missing data. No duplicates were found when the data was tested for them. Looking at the summary of the data set, 8 columns had very ambiguous names: item1-item8, which represented questions from a customer survey. There were 8 columns with missing values: Children with 2495 values missing, Age with 2475, Income with 2490, Techie with 2477, Phone with 1026, TechSupport with 991, Tenure with 931, and Bandwidth_GB_Year with 1021. Outliers were found in 10 columns. The Lat and Lng columns had 151 and 102 outliers respectively, all from locations outside the continental United States such as Hawaii, Alaska, and Puerto Rico. The Population column had 219 outliers, all from large cities like Chicago, the Bronx, or Los Angeles. The Children column had 302 outliers, customers recorded as having 8-10 children. Income had 193 outliers; the right-skewed distribution is normal for income in the United States, and all the outliers had high incomes. Outage_sec_perweek had the most outliers at 491; outliers on the upper end are possible, but some outliers had negative outage times, which should be impossible. Email had 12 outliers with a normal distribution. Contacts had 165 outliers, ranging from 4 to 7 contacts. Yearly_equip_failure had 94 outliers, starting at 3 equipment failures per year with the largest at 6. MonthlyCharge had the fewest outliers at 3, with values from roughly 306 to 316.

D2. Cleaning Anomalies and Justification¶

The methods used to clean the data are as follows:

  1. Remove unnecessary variables/columns using churn.drop('Unnamed: 0', axis='columns', inplace = True), then display the first 5 rows to verify the change. Another column with unique values already exists that can be used as an index.
  2. Have the CaseOrder column act as the index using churn.set_index('CaseOrder', inplace = True), then display the first 5 rows to verify the change.
  3. Rename the ambiguous variables, item1-item8, using the .rename() function, then verify the change with churn.columns. The names already had descriptions, so they were renamed according to those descriptions.
  4. Impute null values with the following process:
    • For the Children column, impute the missing values with 0 using the .fillna() function. This question was most likely left blank because it did not apply to the customer, meaning they have 0 children.
    • The missing values for Techie, Phone, and TechSupport are all imputed with "No" using the .fillna() function. As with Children, these questions were most likely left blank when they did not apply to the customer.
    • To visualize the distribution, use the .hist() function on Age. I tried the IterativeImputer, KNNImputer, and mean imputation methods and kept mean imputation using .fillna(churn['Age'].mean()).
    • Visualize the remaining columns with missing values using .hist(). The imputation method for Income, Tenure, and Bandwidth_GB_Year is the .median(). Use the .hist() function again to verify the distribution is as expected.
  5. Re-express categorical ordinal variables. Yes/No values are changed to 1/0, time zones are assigned their correct UTC offsets, and Education is assigned numbers based on the years of schooling completed. The .unique() function displays all the unique values, churn['x_numeric'] = churn['x'] creates a new column, a dictionary of the form dict_x = {'x_numeric':{'y': z}} defines the mapping, the .replace() function applies it, and churn.x_numeric.unique() verifies the new variable.
  6. To treat outliers, the following steps were followed:
    • Children is treated using the .median() because the data shows people in their 20s with 8 or more children, which is highly unlikely. Outage_sec_perweek is treated by using .drop() on the negative values; outliers on the upper end would not be out of the ordinary, but negative outage times are impossible, so those are the ones treated. The remaining columns are left as is.
    • Display and sort values to verify the changes on the treated columns.
  7. Export the clean data using churn.to_csv()

D3. Cleaning Summary¶

The main methods used to treat the data were imputation of null values and treatment of outliers. There were no duplicates in the data, so none needed treatment. Variables that were not descriptive enough were renamed, and the categorical variables were re-expressed numerically. Many outliers were left alone, with the reasoning justified above. The cleaned data contains 9989 rows.

In [199]:
clean_churn.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9989 entries, 0 to 9988
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   CaseOrder                     9989 non-null   int64  
 1   Customer_id                   9989 non-null   object 
 2   Interaction                   9989 non-null   object 
 3   City                          9989 non-null   object 
 4   State                         9989 non-null   object 
 5   County                        9989 non-null   object 
 6   Zip                           9989 non-null   int64  
 7   Lat                           9989 non-null   float64
 8   Lng                           9989 non-null   float64
 9   Population                    9989 non-null   int64  
 10  Area                          9989 non-null   object 
 11  Timezone                      9989 non-null   object 
 12  Job                           9989 non-null   object 
 13  Children                      9989 non-null   float64
 14  Age                           9989 non-null   float64
 15  Education                     9989 non-null   object 
 16  Employment                    9989 non-null   object 
 17  Income                        9989 non-null   float64
 18  Marital                       9989 non-null   object 
 19  Gender                        9989 non-null   object 
 20  Churn                         9989 non-null   object 
 21  Outage_sec_perweek            9989 non-null   float64
 22  Email                         9989 non-null   int64  
 23  Contacts                      9989 non-null   int64  
 24  Yearly_equip_failure          9989 non-null   int64  
 25  Techie                        9989 non-null   object 
 26  Contract                      9989 non-null   object 
 27  Port_modem                    9989 non-null   object 
 28  Tablet                        9989 non-null   object 
 29  InternetService               9989 non-null   object 
 30  Phone                         9989 non-null   object 
 31  Multiple                      9989 non-null   object 
 32  OnlineSecurity                9989 non-null   object 
 33  OnlineBackup                  9989 non-null   object 
 34  DeviceProtection              9989 non-null   object 
 35  TechSupport                   9989 non-null   object 
 36  StreamingTV                   9989 non-null   object 
 37  StreamingMovies               9989 non-null   object 
 38  PaperlessBilling              9989 non-null   object 
 39  PaymentMethod                 9989 non-null   object 
 40  Tenure                        9989 non-null   float64
 41  MonthlyCharge                 9989 non-null   float64
 42  Bandwidth_GB_Year             9989 non-null   float64
 43  timely_response               9989 non-null   int64  
 44  timely_fixes                  9989 non-null   int64  
 45  timely_replacements           9989 non-null   int64  
 46  reliability                   9989 non-null   int64  
 47  options                       9989 non-null   int64  
 48  respectful_response           9989 non-null   int64  
 49  courteous_exchange            9989 non-null   int64  
 50  evidence_of_active_listening  9989 non-null   int64  
 51  Timezone_numeric              9989 non-null   int64  
 52  Education_numeric             9989 non-null   int64  
 53  churn_numeric                 9989 non-null   int64  
 54  techie_numeric                9989 non-null   int64  
 55  port_modem_numeric            9989 non-null   int64  
 56  tablet_numeric                9989 non-null   int64  
 57  phone_numeric                 9989 non-null   int64  
 58  multiple_numeric              9989 non-null   int64  
 59  online_security_numeric       9989 non-null   int64  
 60  online_backup_numeric         9989 non-null   int64  
 61  device_protection_numeric     9989 non-null   int64  
 62  tech_support_numeric          9989 non-null   int64  
 63  streaming_tv_numeric          9989 non-null   int64  
 64  streaming_movies_numeric      9989 non-null   int64  
 65  paperless_billing_numeric     9989 non-null   int64  
 66  lat_z                         9989 non-null   float64
 67  lng_z                         9989 non-null   float64
 68  population_z                  9989 non-null   float64
 69  children_z                    9989 non-null   float64
 70  age_z                         9989 non-null   float64
 71  income_z                      9989 non-null   float64
 72  outage_sec_perweek_z          9989 non-null   float64
 73  email_z                       9989 non-null   float64
 74  contacts_z                    9989 non-null   float64
 75  yearly_equip_failure_z        9989 non-null   float64
 76  tenure_z                      9989 non-null   float64
 77  monthly_charge_z              9989 non-null   float64
 78  bandwidth_gb_year_z           9989 non-null   float64
dtypes: float64(22), int64(29), object(28)
memory usage: 6.0+ MB
In [200]:
msno.matrix(clean_churn, fontsize = 12, labels=True)
plt.title('Missing data matrix')
plt.show()

D4. Cleaning Python Code¶

Data was cleaned for each variable after an anomaly was found. See above code at C4.

D5. Clean Data¶

See attached file named 'clean_churn.csv'

D6. Limitations¶

There are a few limitations to the methods I used to clean the data. Deletion risks losing important information and reduces the sample size. The other methods used were univariate imputation; imputing with the mean, median, or mode can distort the distribution of the data, as the sketch below illustrates.
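To illustrate the distortion risk, mean imputation leaves the mean unchanged but shrinks the spread of the data. A minimal, self-contained sketch on synthetic data (the numbers are illustrative assumptions):

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
s = pd.Series(rng.normal(50, 10, 1000))                # synthetic income-like column
s[s.sample(frac=0.25, random_state=0).index] = np.nan  # knock out 25% of the values
imputed = s.fillna(s.mean())                           # univariate mean imputation
print(s.std(), imputed.std())                          # imputed series has a smaller std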

D7. Challenges¶

One of the main challenges a data analyst may encounter when using my now-cleaned data for analysis is that the data is not updated continuously. Values such as subscribed services, number of children, tenure, and monthly charges can change over time; this data reflects a single point in time, so it may not be relevant for future use. Another issue is the number of outliers left in the data: the decisions to keep them were mostly based on assumptions and would need to be verified by the company that collected the data.

Part IV: PCA¶

E1. Variables¶

All of the following numeric variables were used: 'Population','Children','Age','Income','Outage_sec_perweek','Email','Contacts','Yearly_equip_failure','Tenure','MonthlyCharge','Bandwidth_GB_Year'.

E2. Retained PC's¶

Principal components 1-5 were kept since their eigenvalues are above 1. This was decided using the Kaiser criterion, in which "we pick only the principal components that have eigenvalues greater than 1." (Bento, 2020) In the following scree plot, you can see which components fall above the red line at 1; a programmatic check follows the plot.

In [201]:
plt.plot(eigenvalues)
plt.xlabel('number of components')
plt.ylabel('eigenvalues')
plt.axhline(y=1, color='red')
plt.show()
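The same selection can be made programmatically from the eigenvalues list computed earlier; a minimal sketch:

retained = [i + 1 for i, ev in enumerate(eigenvalues) if ev > 1]
print(retained)  # [1, 2, 3, 4, 5] given the eigenvalues above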

E3. Benefits¶

The benefit of principal component analysis is that it reduces many correlated variables to a few components, keeping the focus on the variables and correlations that matter and cutting the time spent searching for correlations. For PC1, Tenure and Bandwidth_GB_Year contributed the most. For PC2, the largest contributors were Outage_sec_perweek and MonthlyCharge. For PC3 they were Population, Children, and Email. For PC4 they were Age and Contacts. For PC5 they were Income and Yearly_equip_failure.
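The top contributors described above can be read directly from the loadings table computed earlier; a minimal sketch:

for pc in ['PC1', 'PC2', 'PC3', 'PC4', 'PC5']:
    top = loadings[pc].abs().sort_values(ascending=False).head(2)
    print(pc, list(top.index))  # two largest absolute loadings per component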

F. Panopto Recording¶

https://wgu.hosted.panopto.com/Panopto/Pages/Viewer.aspx?id=3d055c6e-026a-4f20-bad3-afba01670c23

G. Third-Party Code References¶

pandas.set_option. pandas 1.5.3 documentation. (n.d.). Retrieved January 31, 2023, from https://pandas.pydata.org/docs/reference/api/pandas.set_option.html

Kapoor, N. (2020, November 6). MICE and KNN missing value imputations through Python. Numpy Ninja. Retrieved January 31, 2023, from https://www.numpyninja.com/post/mice-and-knn-missing-value-imputations-through-python

H. References¶

Lesson 5: Missing Data. WGU. (n.d.). Retrieved February 2, 2023, from https://cgp-oex.wgu.edu/courses/course-v1:WGUx+OEX0026+v02/jump_to_id/b768dbdb41dd41f9b3f865bf80ad3b1a

R or Python. Western Governors University. (2022, November 16). Retrieved February 3, 2023, from https://www.wgu.edu/online-it-degrees/programming-languages/r-or-python.html#_

Time zone converter: Calculate time difference between time zones in the world. Time Difference. (n.d.). Retrieved February 2, 2023, from https://www.zeitverschiebung.net/en/

Larose, D. T., & Larose, C. D. (2019). Data Science Using Python and R. Wiley.

Bento, C. (2020, October 13). Principal component analysis algorithm in real-life: Discovering patterns in a real-estate dataset. Medium. Retrieved January 31, 2023, from https://towardsdatascience.com/principal-component-analysis-algorithm-in-real-life-discovering-patterns-in-a-real-estate-dataset-18134c57ffe7