import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import linear_model
import statsmodels.api as sm
from scipy import stats


covid_vaccine_data = pd.read_csv("country_vaccinations.csv")
vaccine_data = covid_vaccine_data.copy(deep=True)
vaccine_data


covid_daily_data = pd.read_csv("worldometer_coronavirus_daily_data.csv")
daily_data = covid_daily_data.copy(deep=True)
daily_data


# Drop rows for dates we do not need
rows_to_drop = []

# Cycle through the daily data table by row
for index, row in daily_data.iterrows():
    if daily_data.loc[index,'date'][0:4] == '2020' or daily_data.loc[index,'date'][0:6] == '2021-1' or \
    (daily_data.loc[index,'date'][0:6] == '2021-2' and int(daily_data.loc[index,'date'][7:9]) < 22):
        rows_to_drop.append(index)
        
    # Capitalizing the country name
    daily_data.loc[index,'country'] = daily_data.loc[index,'country'].upper()
        
# drop the rows
daily_data = daily_data.drop(rows_to_drop)
daily_data


# Drop rows for dates we do not need
rows_to_drop = []

# Cycle through the vaccinations data table by row
for index, row in vaccine_data.iterrows():
    if vaccine_data.loc[index,'date'][0:10] == '2021-05-12':
        rows_to_drop.append(index)
        
    # Capitalizing the country name
    vaccine_data.loc[index,'country'] = vaccine_data.loc[index,'country'].upper()
        
# drop the rows
vaccine_data = vaccine_data.drop(rows_to_drop)
vaccine_data


# Count the countries in each table
vaccine_countries = []
for index, row in vaccine_data.iterrows():
    if vaccine_data.loc[index,'country'] not in vaccine_countries:
        vaccine_countries.append(vaccine_data.loc[index,'country'])

daily_data_countries = []
for index, row in daily_data.iterrows():
    if daily_data.loc[index,'country'] not in daily_data_countries:
        daily_data_countries.append(daily_data.loc[index,'country'])

print('Number of countries in vaccine table: ' + str(len(vaccine_countries)))
print('Number of countries in daily data table: ' + str(len(daily_data_countries)))

Number of countries in vaccine table: 205
Number of countries in daily data table: 220


# Deciding for which countries we need to drop rows.
drop_countries = []

# Checking vaccine data
for country in vaccine_countries:
    if country not in daily_data_countries:
        drop_countries.append(country)
        
for country in daily_data_countries:
    if country not in vaccine_countries:
        drop_countries.append(country)
        
print(drop_countries)
print('\nNumber of countries to drop:' + str(len(drop_countries)))

['BONAIRE SINT EUSTATIUS AND SABA', 'BRUNEI', 'CAPE VERDE', "COTE D'IVOIRE", 'CZECHIA', 'DEMOCRATIC REPUBLIC OF CONGO', 'ENGLAND', 'ESWATINI', 'FALKLAND ISLANDS', 'GUERNSEY', 'HONG KONG', 'JERSEY', 'KOSOVO', 'MACAO', 'NAURU', 'NORTH MACEDONIA', 'NORTHERN CYPRUS', 'NORTHERN IRELAND', 'PALESTINE', 'SCOTLAND', 'SINT MAARTEN (DUTCH PART)', 'TIMOR', 'TONGA', 'TURKMENISTAN', 'TUVALU', 'UNITED KINGDOM', 'UNITED STATES', 'VIETNAM', 'WALES', 'WALLIS AND FUTUNA', 'BENIN', 'BRITISH VIRGIN ISLANDS', 'BRUNEI DARUSSALAM', 'BURKINA FASO', 'BURUNDI', 'CABO VERDE', 'CARIBBEAN NETHERLANDS', 'CENTRAL AFRICAN REPUBLIC', 'CHAD', 'CHANNEL ISLANDS', 'CHINA HONG KONG SAR', 'CHINA MACAO SAR', 'COTE D IVOIRE', 'CUBA', 'CZECH REPUBLIC', 'DEMOCRATIC REPUBLIC OF THE CONGO', 'ERITREA', 'FALKLAND ISLANDS MALVINAS', 'FRENCH GUIANA', 'GUADELOUPE', 'GUINEA BISSAU', 'HAITI', 'HOLY SEE', 'LIBERIA', 'MACEDONIA', 'MADAGASCAR', 'MARSHALL ISLANDS', 'MARTINIQUE', 'MAYOTTE', 'MICRONESIA', 'REUNION', 'SAINT BARTHELEMY', 'SAINT MARTIN', 'SAINT PIERRE AND MIQUELON', 'SINT MAARTEN', 'STATE OF PALESTINE', 'SWAZILAND', 'TANZANIA', 'TIMOR LESTE', 'UK', 'USA', 'VANUATU', 'VIET NAM', 'WALLIS AND FUTUNA ISLANDS', 'WESTERN SAHARA']

Number of countries to drop:75


# Filtering out the names to save
saved = ['UNITED KINGDOM','UK','UNITED STATES','USA','VIETNAM','VIET NAM','STATE OF PALESTINE','PALESTINE' \
        ,'NORTH MACEDONIA','MACEDONIA']

unsaved = [] # Creating a new list because of a remove error trying to remove from drop_countries
for country in drop_countries:
    if country not in saved:
        unsaved.append(country[0:])

rows_to_drop = []
# Going through vaccine table to change names as necessary 
# and mark rows that we need to drop
for index, row in vaccine_data.iterrows():
    if vaccine_data.loc[index,'country'] == 'UK':
        vaccine_data.loc[index,'country'] = 'UNITED KINGDOM'
    elif vaccine_data.loc[index,'country'] == 'USA':
        vaccine_data.loc[index,'country'] = 'UNITED STATES'
    elif vaccine_data.loc[index,'country'] == 'VIET NAM':
        vaccine_data.loc[index,'country'] = 'VIETNAM'
    elif vaccine_data.loc[index,'country'] == 'STATE OF PALESTINE':
        vaccine_data.loc[index,'country'] = 'PALESTINE'
    elif vaccine_data.loc[index,'country'] == 'NORTH MACEDONIA':
        vaccine_data.loc[index,'country'] = 'MACEDONIA'
        
    # marking row if needed to be dropped
    if vaccine_data.loc[index,'country'] in unsaved:
        rows_to_drop.append(index)

# drop marked rows from vaccine table
vaccine_data = vaccine_data.drop(rows_to_drop)


# Repeat the same name change and row drop process as above but with the daily data table instead.
rows_to_drop = []

for index, row in daily_data.iterrows():
    if daily_data.loc[index,'country'] == 'UK':
        daily_data.loc[index,'country'] = 'UNITED KINGDOM'
    elif daily_data.loc[index,'country'] == 'USA':
        daily_data.loc[index,'country'] = 'UNITED STATES'
    elif daily_data.loc[index,'country'] == 'VIET NAM':
        daily_data.loc[index,'country'] = 'VIETNAM'
    elif daily_data.loc[index,'country'] == 'STATE OF PALESTINE':
        daily_data.loc[index,'country'] = 'PALESTINE'
    elif daily_data.loc[index,'country'] == 'NORTH MACEDONIA':
        daily_data.loc[index,'country'] = 'MACEDONIA'
        
    # marking row if needed to be dropped
    if daily_data.loc[index,'country'] in unsaved:
        rows_to_drop.append(index)

daily_data = daily_data.drop(rows_to_drop)


print('Rows in vaccine_data: ' + str(len(vaccine_data.index)))
print('Rows in daily_data: ' + str(len(daily_data.index)))

Rows in vaccine_data: 15958
Rows in daily_data: 14220


# Count the countries in each table
vaccine_countries = []
for index, row in vaccine_data.iterrows():
    if vaccine_data.loc[index,'country'] not in vaccine_countries:
        vaccine_countries.append(vaccine_data.loc[index,'country'])

daily_data_countries = []
for index, row in daily_data.iterrows():
    if daily_data.loc[index,'country'] not in daily_data_countries:
        daily_data_countries.append(daily_data.loc[index,'country'])

print('Number of countries in vaccine table: ' + str(len(vaccine_countries)))
print('Number of countries in daily data table: ' + str(len(daily_data_countries)))
vaccine_countries.sort()
daily_data_countries.sort()
if vaccine_countries == daily_data_countries:
    print('Countries in both lists are the same.')

Number of countries in vaccine table: 180
Number of countries in daily data table: 180
Countries in both lists are the same.


# calculate rate of daily cases for each country
country_rate_daily_cases = {} # ex: Country: [3,4,5] 
for index, row in daily_data.iterrows():
    if daily_data.loc[index, 'country'] in country_rate_daily_cases:
        temp_lst = country_rate_daily_cases[daily_data.loc[index, 'country']]
        temp_lst.append(daily_data.loc[index, 'daily_new_cases'])
        country_rate_daily_cases[daily_data.loc[index, 'country']] = temp_lst
        
    else:
        country_rate_daily_cases[daily_data.loc[index, 'country']] = [daily_data.loc[index, 'daily_new_cases']]
        
# Calculate rate for every country
country_case_rate = [] # ex: [(Country,rate)]
for country, val in country_rate_daily_cases.items():
    x_lst = [i for i in range(0,len(val))]
    slope, intercept = np.polyfit(x_lst, val, 1)
    country_case_rate.append((country,slope))
    
country_case_rate.sort()


# gather daily vaccinations for each country
country_rate_vac = {} # ex: Country: [3,4,5] 
# create a list to keep track of skipped countries
skipped_nan = []
for index, row in vaccine_data.iterrows():
    # Check for a NaN, and if present, mark the country then skip this row
    if str(vaccine_data.loc[index, 'daily_vaccinations']) == 'nan':
        if vaccine_data.loc[index, 'country'] not in skipped_nan:
            skipped_nan.append(vaccine_data.loc[index, 'country'])
        continue
    if vaccine_data.loc[index, 'country'] in country_rate_vac:
        temp_lst = country_rate_vac[vaccine_data.loc[index, 'country']]
        temp_lst.append(vaccine_data.loc[index, 'daily_vaccinations'])
        country_rate_vac[vaccine_data.loc[index, 'country']] = temp_lst
    else:
        country_rate_vac[vaccine_data.loc[index, 'country']] = [vaccine_data.loc[index, 'daily_vaccinations']]
    
# Calculate the rate for every country
country_vac_rate = [] # ex: [(Country,rate)]
for country, val in country_rate_vac.items():
    x_lst = [i for i in range(0,len(val))]
    slope, intercept = np.polyfit(x_lst, val, 1)
    country_vac_rate.append((country, slope))
    
country_vac_rate.sort()


print('Length of vaccination rates list: ' + str(len(country_vac_rate)))
print('Length of case rates list: ' + str(len(country_case_rate)))

Length of vaccination rates list: 178
Length of case rates list: 180


for skipped in skipped_nan:
    if skipped not in country_rate_vac:
        print('Completely skipped: ' + str(skipped))

Completely skipped: TAJIKISTAN
Completely skipped: YEMEN


for country, rate in country_case_rate:
    if country == 'TAJIKISTAN' or country == 'YEMEN':
        country_case_rate.remove((country,rate))
print('Length of vaccination rates list: ' + str(len(country_vac_rate)))
print('Length of case rates list: ' + str(len(country_case_rate)))

Length of vaccination rates list: 178
Length of case rates list: 178


scatter1_x = np.array([rate for (country, rate) in country_vac_rate])
scatter1_y = np.array([rate for (country, rate) in country_case_rate])
plt.subplots(figsize=(16,10))
plt.scatter(scatter1_x,scatter1_y)
plt.xlabel('Rate of Daily Vaccinations by Country')
plt.ylabel('Rate of Daily Cases by Country')
plt.title('Rate of Daily Cases vs Rate of Daily Vaccinations in 2021')
plt.show()


# Go through vaccination rates
for (country, rate) in country_vac_rate:
    if (rate > 7000):
        print('High vaccination rate: ' + str(country))

High vaccination rate: CHINA
High vaccination rate: INDIA
High vaccination rate: UNITED STATES


for country, rate in country_case_rate:
    if country == 'CHINA' or country == 'INDIA' or country == 'UNITED STATES':
        country_case_rate.remove((country,rate))

for country, rate in country_vac_rate:
    if country == 'CHINA' or country == 'INDIA' or country == 'UNITED STATES':
        country_vac_rate.remove((country,rate))


scatter2_x = np.array([rate for (country, rate) in country_vac_rate])
scatter2_y = np.array([rate for (country, rate) in country_case_rate])
plt.subplots(figsize=(16,10))
plt.scatter(scatter2_x,scatter2_y)
plt.xlabel('Rate of Daily Vaccinations by Country')
plt.ylabel('Rate of Daily Cases by Country')
plt.title('Rate of Daily Cases vs Rate of Daily Vaccinations in 2021')
plt.show()


# calculate rate of daily cases for each country by month
country_rate_daily_cases = {} # ex: Country: [[2,3,4],[3,4,5]] ---> [(1,Country,rate), ...]
for index, row in daily_data.iterrows():
    month = int(daily_data.loc[index,'date'][5:6]) - 2 #use month-2 because we count from Feb, indexed to 0.
    if daily_data.loc[index, 'country'] in country_rate_daily_cases:
        country_rate_daily_cases[daily_data.loc[index, 'country']][month-2]\
        .append(daily_data.loc[index,'daily_new_cases'])
        
    else:
        country_rate_daily_cases[daily_data.loc[index,'country']] = [[],[],[],[]]
        country_rate_daily_cases[daily_data.loc[index,'country']][month-2]\
        .append(daily_data.loc[index,'daily_new_cases'])

# Calculate rate for every country
country_case_rate = []
for country, val in country_rate_daily_cases.items(): # val is a list of lists here
    for val_index in range(0,len(val)):
        x_lst = [i for i in range(0,len(val[val_index]))]
        slope, intercept = np.polyfit(x_lst, val[val_index], 1)
        country_case_rate.append((val_index,country,slope))
    
country_case_rate.sort()


# calculate rate of daily vaccinations for each country by month
country_rate_daily_vac = {} # ex: Country: [[2,3,4],[3,4,5]] ---> [(1,Country,rate), ...]
for index, row in vaccine_data.iterrows():
    # Check for a NaN, and if present, mark the country then skip this row
    if str(vaccine_data.loc[index, 'daily_vaccinations']) == 'nan':
        continue
    month = int(vaccine_data.loc[index,'date'][6:7]) - 2 #use month-2 because we count from Feb, indexed to 0.
    if vaccine_data.loc[index, 'country'] in country_rate_daily_vac:
        country_rate_daily_vac[vaccine_data.loc[index, 'country']][month-2]\
        .append(vaccine_data.loc[index,'daily_vaccinations'])
        
    else:
        country_rate_daily_vac[vaccine_data.loc[index,'country']] = [[],[],[],[]]
        country_rate_daily_vac[vaccine_data.loc[index,'country']][month-2]\
        .append(vaccine_data.loc[index,'daily_vaccinations'])

# Calculate rate for every country
country_vac_rate = []
for country, val in country_rate_daily_vac.items():
    for val_index in range(0,len(val)):
        if len(val[val_index]) > 0:
            x_lst = [i for i in range(0,len(val[val_index]))]
#             print(len(x_lst) == len(lst))
#             print(lst)
            try:
                slope, intercept = np.polyfit(x_lst, val[val_index], 1)
            except:
                continue
            country_vac_rate.append((val_index, country, slope))
    
country_vac_rate.sort()


# Storing the rates as values to the 2-tuple (of month and country) key.
case_rate_dict = {}
for (month,country,rate) in country_case_rate:
    case_rate_dict[(month,country)] = rate
vac_rate_dict = {}
for (month,country,rate) in country_vac_rate:
    vac_rate_dict[(month,country)] = rate
    
# removing unshared values
to_remove = []
for key, val in case_rate_dict.items():
    if key not in vac_rate_dict or str(val) == 'nan':
        to_remove.append(key)
        
for remove_item in to_remove:
    case_rate_dict.pop(remove_item,None)
    
to_remove = []
for key, val in vac_rate_dict.items():
    if key not in case_rate_dict or str(val) == 'nan':
        to_remove.append(key)
        
for remove_item in to_remove:
    vac_rate_dict.pop(remove_item,None)
    
if len(case_rate_dict) == len(vac_rate_dict):
    print('The unshared values have been removed')
    
# removing outliers
to_remove = []
for key, val in vac_rate_dict.items():
    if val > 20000:
        to_remove.append(key)
for remove_item in to_remove:
    vac_rate_dict.pop(remove_item,None)
    case_rate_dict.pop(remove_item,None)
    
to_remove = []
for key, val in case_rate_dict.items():
    if val > 1800:
        to_remove.append(key)
for remove_item in to_remove:
    vac_rate_dict.pop(remove_item,None)
    case_rate_dict.pop(remove_item,None)

The unshared values have been removed


scatter3_x = np.array([rate for (month, country), rate in vac_rate_dict.items()])
scatter3_y = np.array([rate for (month, country), rate in case_rate_dict.items()])
plt.subplots(figsize=(16,10))
plt.scatter(scatter3_x,scatter3_y, color=['green'])
plt.xlabel('Rate of Daily Vaccinations by Country Per Month')
plt.ylabel('Rate of Daily Cases by Country Per Month')
plt.title('Rate of Daily Cases vs Rate of Daily Vaccinations by Month in 2021')
plt.show()


# calculate the monthly cumulative vaccinated people per hundred
monthly_cumulative_vac = {} # ex: Country: [3000,0,0,0] 
for index, row in vaccine_data.iterrows():
    # Check for a NaN, and if present, mark the country then skip this row
    if str(vaccine_data.loc[index, 'people_fully_vaccinated_per_hundred']) == 'nan':
        continue
    month = int(vaccine_data.loc[index,'date'][6:7]) - 2 #use month-2 because we count from Feb, indexed to 0.
    if vaccine_data.loc[index, 'country'] in monthly_cumulative_vac:
        monthly_cumulative_vac[vaccine_data.loc[index, 'country']][month-2]\
        = vaccine_data.loc[index,'people_fully_vaccinated_per_hundred']
        
    else:
        monthly_cumulative_vac[vaccine_data.loc[index,'country']] = [0,0,0,0]
        monthly_cumulative_vac[vaccine_data.loc[index,'country']][month-2]\
        = vaccine_data.loc[index,'people_fully_vaccinated_per_hundred']

# Calculate rate for every country
country_vac_curr = [] # ex: [(1,Country,rate), ...]

for country, val in monthly_cumulative_vac.items():
    for val_index in range(0, len(val)):
        if val[val_index] != 0:
            country_vac_curr.append((val_index, country, val[val_index]))
    
country_vac_curr.sort()


# Storing the rates as values to the 2-tuple (of month and country) key.
case_rate_dict = {}
for (month,country,rate) in country_case_rate:
    case_rate_dict[(month,country)] = rate
vac_num_dict = {}
for (month,country,vac_count) in country_vac_curr:
    vac_num_dict[(month,country)] = vac_count
    
# removing unshared values
to_remove = []
for key, val in case_rate_dict.items():
    if key not in vac_rate_dict or str(val) == 'nan':
        to_remove.append(key)
        
for remove_item in to_remove:
    case_rate_dict.pop(remove_item,None)
    
to_remove = []
for key, val in vac_num_dict.items():
    if key not in case_rate_dict or str(val) == 'nan':
        to_remove.append(key)
        
for remove_item in to_remove:
    vac_num_dict.pop(remove_item,None)
    
if len(case_rate_dict) == len(vac_num_dict):
    print('The unshared values have been removed')
    
# removing outliers
to_remove = []
for key, val in vac_num_dict.items():
    if val > 60:
        to_remove.append(key)
for remove_item in to_remove:
    vac_num_dict.pop(remove_item,None)
    case_rate_dict.pop(remove_item,None)
    
to_remove = []
for key, val in case_rate_dict.items():
    if abs(val) > 600:
        to_remove.append(key)
for remove_item in to_remove:
    vac_num_dict.pop(remove_item,None)
    case_rate_dict.pop(remove_item,None)

The unshared values have been removed


scatter4_x = np.array([vac_count for (month, country), vac_count in vac_num_dict.items()])
scatter4_y = np.array([rate for (month, country), rate in case_rate_dict.items()])
plt.subplots(figsize=(16,10))
plt.scatter(scatter4_x,scatter4_y, color=['orange'])
plt.xlabel('Cumulative Number of Those Fully Vaccinated per Hundred People by Country Per Month')
plt.ylabel('Rate of Daily Cases by Country Per Month')
plt.title('Rate of Daily Cases vs Cumulative Number of Those Fully Vaccinated per Hundred People by Month in 2021')
plt.show()


# Clear any potential NaN's
ind = []
for vac_count in scatter4_x:
    if str(vac_count) != 'nan':
        ind.append([vac_count])
dep = []
for rate in scatter4_y:
    if str(rate) != 'nan':
        dep.append(rate)
        
lm = linear_model.LinearRegression()
lm.fit(ind,dep)

b1 = lm.coef_[0]
b0 = lm.intercept_
print('Linear regression line for:')
print('Rate of Daily Cases vs Cumulative Number of Those Fully Vaccinated per Hundred People by Month is')
print('y = ' + str(b1) + 'x + ' + str(b0))

Linear regression line for:
Rate of Daily Cases vs Cumulative Number of Those Fully Vaccinated per Hundred People by Month is
y = -1.3722702484693745x + 9.802660202474815


scatter5_x = np.array([vac_count for (month, country), vac_count in vac_num_dict.items()])
scatter5_y = np.array([rate for (month, country), rate in case_rate_dict.items()])
figure, ax = plt.subplots(figsize=(16,10))
plt.scatter(scatter5_x,scatter5_y, color=['orange'])
ax.plot(scatter5_x, b1*scatter5_x + b0, color='b') # plotting regression line
plt.xlabel('Cumulative Number of Those Fully Vaccinated per Hundred People by Country Per Month')
plt.ylabel('Rate of Daily Cases by Country Per Month')
plt.title('Rate of Daily Cases vs Cumulative Number of Those Fully Vaccinated per Hundred People by Month in 2021')
plt.show()


ind2 = sm.add_constant(ind) # Modifying the independent variable as per the documentation
statsModel = sm.OLS(dep, ind2) # OLS is ordinary least squares as we just recently discussed
statsResults = statsModel.fit()
print('P-value: ' + str(statsResults.f_pvalue))

P-value: 0.013826826339792809

	country	iso_code	date	total_vaccinations	people_vaccinated	people_fully_vaccinated	daily_vaccinations_raw	daily_vaccinations	total_vaccinations_per_hundred	people_vaccinated_per_hundred	people_fully_vaccinated_per_hundred	daily_vaccinations_per_million	vaccines	source_name	source_website
0	Afghanistan	AFG	2021-02-22	0.0	0.0	NaN	NaN	NaN	0.00	0.00	NaN	NaN	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
1	Afghanistan	AFG	2021-02-23	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
2	Afghanistan	AFG	2021-02-24	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
3	Afghanistan	AFG	2021-02-25	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
4	Afghanistan	AFG	2021-02-26	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
17602	Zimbabwe	ZWE	2021-05-08	657838.0	509274.0	148564.0	17076.0	19648.0	4.43	3.43	1.00	1322.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17603	Zimbabwe	ZWE	2021-05-09	684243.0	526066.0	158177.0	26405.0	22863.0	4.60	3.54	1.06	1538.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17604	Zimbabwe	ZWE	2021-05-10	690653.0	529360.0	161293.0	6410.0	21877.0	4.65	3.56	1.09	1472.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17605	Zimbabwe	ZWE	2021-05-11	709772.0	539526.0	170246.0	19119.0	21428.0	4.78	3.63	1.15	1442.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17606	Zimbabwe	ZWE	2021-05-12	730365.0	549797.0	180568.0	20593.0	22019.0	4.91	3.70	1.21	1481.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...

	date	country	cumulative_total_cases	daily_new_cases	active_cases	cumulative_total_deaths	daily_new_deaths
0	2020-2-15	Afghanistan	0.0	NaN	0.0	0.0	NaN
1	2020-2-16	Afghanistan	0.0	NaN	0.0	0.0	NaN
2	2020-2-17	Afghanistan	0.0	NaN	0.0	0.0	NaN
3	2020-2-18	Afghanistan	0.0	NaN	0.0	0.0	NaN
4	2020-2-19	Afghanistan	0.0	NaN	0.0	0.0	NaN
...	...	...	...	...	...	...	...
99459	2021-5-07	Zimbabwe	38403.0	5.0	786.0	1576.0	1.0
99460	2021-5-08	Zimbabwe	38414.0	11.0	786.0	1576.0	0.0
99461	2021-5-09	Zimbabwe	38419.0	5.0	780.0	1576.0	0.0
99462	2021-5-10	Zimbabwe	38433.0	14.0	649.0	1576.0	0.0
99463	2021-5-11	Zimbabwe	38448.0	15.0	648.0	1579.0	3.0

	date	country	cumulative_total_cases	daily_new_cases	active_cases	cumulative_total_deaths	daily_new_deaths
373	2021-2-22	AFGHANISTAN	55646.0	29.0	4316.0	2435.0	2.0
374	2021-2-23	AFGHANISTAN	55664.0	18.0	4261.0	2436.0	1.0
375	2021-2-24	AFGHANISTAN	55680.0	16.0	4156.0	2438.0	2.0
376	2021-2-25	AFGHANISTAN	55696.0	16.0	3973.0	2442.0	4.0
377	2021-2-26	AFGHANISTAN	55707.0	11.0	3979.0	2443.0	1.0
...	...	...	...	...	...	...	...
99459	2021-5-07	ZIMBABWE	38403.0	5.0	786.0	1576.0	1.0
99460	2021-5-08	ZIMBABWE	38414.0	11.0	786.0	1576.0	0.0
99461	2021-5-09	ZIMBABWE	38419.0	5.0	780.0	1576.0	0.0
99462	2021-5-10	ZIMBABWE	38433.0	14.0	649.0	1576.0	0.0
99463	2021-5-11	ZIMBABWE	38448.0	15.0	648.0	1579.0	3.0

	country	iso_code	date	total_vaccinations	people_vaccinated	people_fully_vaccinated	daily_vaccinations_raw	daily_vaccinations	total_vaccinations_per_hundred	people_vaccinated_per_hundred	people_fully_vaccinated_per_hundred	daily_vaccinations_per_million	vaccines	source_name	source_website
0	AFGHANISTAN	AFG	2021-02-22	0.0	0.0	NaN	NaN	NaN	0.00	0.00	NaN	NaN	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
1	AFGHANISTAN	AFG	2021-02-23	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
2	AFGHANISTAN	AFG	2021-02-24	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
3	AFGHANISTAN	AFG	2021-02-25	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
4	AFGHANISTAN	AFG	2021-02-26	NaN	NaN	NaN	NaN	1367.0	NaN	NaN	NaN	35.0	Oxford/AstraZeneca, Pfizer/BioNTech, Sinopharm...	World Health Organization	https://covid19.who.int/
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
17601	ZIMBABWE	ZWE	2021-05-07	640762.0	500422.0	140340.0	33407.0	20060.0	4.31	3.37	0.94	1350.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17602	ZIMBABWE	ZWE	2021-05-08	657838.0	509274.0	148564.0	17076.0	19648.0	4.43	3.43	1.00	1322.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17603	ZIMBABWE	ZWE	2021-05-09	684243.0	526066.0	158177.0	26405.0	22863.0	4.60	3.54	1.06	1538.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17604	ZIMBABWE	ZWE	2021-05-10	690653.0	529360.0	161293.0	6410.0	21877.0	4.65	3.56	1.09	1472.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...
17605	ZIMBABWE	ZWE	2021-05-11	709772.0	539526.0	170246.0	19119.0	21428.0	4.78	3.63	1.15	1442.0	Sinopharm/Beijing	Ministry of Health	https://twitter.com/MoHCCZim/status/1392575941...

Testing Vaccine Efficacy Against COVID¶

By Kashif Peshimam¶

Introduction¶

1. Data Collection¶

2. Data Management¶

3. Exploratory Data Analysis (EDA)¶

4. Hypothesis Testing¶

Machine Learning¶

5. Conclusion¶