Full code listing
Finally, here is all of the code explained so far, collected in one place for reference.
Load the COVID-19 CSV files from the MHLW (Ministry of Health, Labour and Welfare) site and clean them up.
Version 1
# Article013_Pandas Merge COVIC19 files Japan by ken(1).py V1
# Data Source: https://www.mhlw.go.jp/stf/covid-19/open-data.html
# csv files:
# newly_confirmed_cases_daily.csv
# requiring_inpatient_care_etc_daily.csv
# deaths_cumulative_daily.csv
# severe_cases_daily.csv
# pcr_tested_daily.csv
# latlng_data_center.csv
# %%
# Merge COVID-19 Data Files
# Import the necessary libraries
import pandas as pd
from pandas.core.frame import DataFrame
import pandas_datareader.data as web # pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import warnings
warnings.simplefilter('ignore')
# %%
# 1) Load the csv files
csv1 = 'https://covid19.mhlw.go.jp/public/opendata/newly_confirmed_cases_daily.csv'
raw1 = pd.read_csv(csv1)
csv2 = 'https://www.mhlw.go.jp/content/pcr_tested_daily.csv' # Exclude Prefecture (ken)
raw2 = pd.read_csv(csv2)
csv3 = 'https://covid19.mhlw.go.jp/public/opendata/deaths_cumulative_daily.csv'
raw3 = pd.read_csv(csv3)
csv4 = 'https://covid19.mhlw.go.jp/public/opendata/severe_cases_daily.csv'
raw4 = pd.read_csv(csv4)
csv5 = 'data/csv/covid-19/japan/latlng_data_center.csv'
raw5 = pd.read_csv(csv5)
raw1.to_csv('data/csv/covid-19/japan/newly_confirmed_cases_daily.csv', index=False)
raw2.to_csv('data/csv/covid-19/japan/pcr_tested_daily.csv', index=False)
raw3.to_csv('data/csv/covid-19/japan/deaths_cumulative_daily.csv', index=False)
raw4.to_csv('data/csv/covid-19/japan/severe_cases_daily.csv', index=False)
print('1) Load the csv files completed!')
# %%
# 2) Rename Column names
raw1.set_axis(['date', 'prefecture', 'new_cases'], axis=1, inplace=True)
raw2.set_axis(['date', 'new_tests'], axis=1, inplace=True)
raw3.set_axis(['date', 'prefecture','total_deaths'], axis=1, inplace=True)
raw4.set_axis(['date', 'prefecture','severe_cases'], axis=1, inplace=True)
# 2-1) Convert str to datetime
raw1['date'] = pd.to_datetime(raw1['date'])
raw2['date'] = pd.to_datetime(raw2['date'])
raw3['date'] = pd.to_datetime(raw3['date'])
raw4['date'] = pd.to_datetime(raw4['date'])
# 3) Add a file_id field
raw1['file_id'] = 'new_cases'
raw2['file_id'] = 'new_tests'
raw3['file_id'] = 'tot_deaths'
raw4['file_id'] = 'new_severe_cases'
raw5['file_id'] = 'latlng_data'
print('2-3) Rename Column names / Convert date / Add a file id completed!')
# %%
# 4) Add daily or total fields
# 4-1) raw1 newly_confirmed_cases_daily.csv : add a total_cases field
prefs = raw1['prefecture'].unique().tolist()
#print(prefs)
del prefs[0] # Remove 'ALL' from the list
#print(prefs)
#prefs = ['Aichi']
for pref in prefs:
    filter_by = (raw1['prefecture'] == pref)
    dfx = raw1[filter_by]
    dfx.sort_values(by=['date'], inplace=True)
    dfx['total_cases'] = dfx['new_cases'].cumsum()
    #csv = f'data/csv/covid-19/japan/newly_confirmed_cases_daily({pref}).csv'
    #dfx.to_csv(csv, index=False)
    for index, row in dfx.iterrows():
        filter_by = (raw1['prefecture'] == row.prefecture) & (raw1['date'] == row.date)
        raw1.loc[filter_by, 'total_cases'] = row.total_cases
#raw1.to_csv('data/csv/covid-19/japan/newly_confirmed_cases_daily(1).csv', index=False)
#print(raw1.tail())
print('4-1) Add a total_cases field complete!')
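# Note (sketch, not part of the original article): the same per-prefecture running total
# can be computed without the nested iterrows() loop by combining groupby() and cumsum().
# 'raw1_alt' is a hypothetical name used only for this illustration.
raw1_alt = raw1.sort_values(['prefecture', 'date'])
raw1_alt['total_cases'] = raw1_alt.groupby('prefecture')['new_cases'].cumsum()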
# %%
# 4-2) raw4 severe_cases_daily.csv : add a total_severe_cases field
prefs = raw4['prefecture'].unique().tolist()
#print(prefs)
del prefs[0] # Remove 'ALL' from the list
#print(prefs)
#prefs = ['Aichi']
for pref in prefs:
    filter_by = (raw4['prefecture'] == pref)
    dfx = raw4[filter_by]
    dfx.sort_values(by=['date'], inplace=True)
    dfx['total_severe_cases'] = dfx['severe_cases'].cumsum()
    #csv = f'data/csv/covid-19/japan/severe_cases_daily({pref}).csv'
    #dfx.to_csv(csv, index=False)
    for index, row in dfx.iterrows():
        filter_by = (raw4['prefecture'] == row.prefecture) & (raw4['date'] == row.date)
        raw4.loc[filter_by, 'total_severe_cases'] = row.total_severe_cases
#raw4.to_csv('data/csv/covid-19/japan/severe_cases_daily(1).csv', index=False)
#print(raw4.tail())
print('4-2) Add a total_severe_cases field complete!')
# %%
# 4-3) raw3 deaths_cumulative_daily.csv : add a new_deaths field
prefs = raw3['prefecture'].unique().tolist()
#print(prefs)
del prefs[0] # Remove 'ALL' from the list
#print(prefs)
#prefs = ['Aichi']
for pref in prefs:
    filter_by = (raw3['prefecture'] == pref)
    dfx = raw3[filter_by]
    dfx.sort_values(by=['date'], inplace=True)
    dfx['new_deaths'] = dfx['total_deaths'].diff()
    # The first date has no previous row to diff against, so use that day's total_deaths
    first_date = dfx['date'].min()
    filter_by = (dfx['date'] == first_date)
    dfy = dfx[filter_by]
    dfx.loc[filter_by, 'new_deaths'] = dfy['total_deaths']
    #csv = f'data/csv/covid-19/japan/deaths_cumulative_daily({pref}).csv'
    #dfx.to_csv(csv, index=False)
    for index, row in dfx.iterrows():
        filter_by = (raw3['prefecture'] == row.prefecture) & (raw3['date'] == row.date)
        raw3.loc[filter_by, 'new_deaths'] = row.new_deaths
#raw3.to_csv('data/csv/covid-19/japan/deaths_cumulative_daily(1).csv', index=False)
print('4-3) Add a new_deaths field complete!')
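# Note (sketch, not part of the original article): the same result can be obtained with a
# vectorized groupby().diff(); the NaN produced for each prefecture's first date is filled
# with that day's total_deaths, mirroring the loop above. 'raw3_alt' is a hypothetical name.
raw3_alt = raw3.sort_values(['prefecture', 'date'])
raw3_alt['new_deaths'] = (raw3_alt.groupby('prefecture')['total_deaths']
                          .diff()
                          .fillna(raw3_alt['total_deaths']))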
# %%
# 5) Merge the CSV files
raw = pd.concat([raw1, raw2], ignore_index=True, axis=0)
raw = pd.concat([raw, raw3], ignore_index=True, axis=0)
raw = pd.concat([raw, raw4], ignore_index=True, axis=0)
raw = pd.concat([raw, raw5], ignore_index=True, axis=0)
print('5) Merge the CSV files completed!')
# %%
# 6) Replace all NaN values with 2019/1/1, N/A, 0
replace_values = {
    'date': '2019/1/1',
    'prefecture': 'N/A',
    'new_cases': 0,
    'severe_cases': 0,
    'new_tests': 0,
    'new_deaths': 0,
    'total_cases': 0,
    'total_severe_cases': 0,
    'total_deaths': 0,
    'lan': 0, 'long': 0
}
raw.fillna(value=replace_values, inplace=True)
raw.isnull().sum().sum()
print('6) Replace all NaN values with 2019/1/1, N/A, 0 completed!')
# Drop rows if the prefecture == 'ALL'
index_no = raw[ raw['prefecture'] == 'ALL' ].index
raw.drop(index_no, inplace=True)
#raw.head()
# %%
# 7) Update all lan, long if value is 0
raw5.set_index('prefecture', inplace=True)
cnt = 0
for index, row in raw5.iterrows():
    #print(f'Update raw {index}, lan={row.lan}, long={row.long}')
    raw.loc[ raw['prefecture'] == index, ['lan', 'long'] ] = [row.lan, row.long]
    #cnt += 1
    #if cnt > 5:
    #    break
raw5.reset_index(inplace=True)
#csv_file = 'data/csv/covid-19/japan/combined(2).csv'
#raw.to_csv(csv_file, index=False)
print('7) Update lan / long fields completed!')
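# Note (sketch, not part of the original article): the same update can be expressed as a
# single left merge on 'prefecture' instead of looping over raw5 row by row.
# 'raw_alt' is a hypothetical name; unmatched rows are set back to 0 to match step 6.
raw_alt = raw.drop(columns=['lan', 'long']).merge(
    raw5[['prefecture', 'lan', 'long']], on='prefecture', how='left')
raw_alt[['lan', 'long']] = raw_alt[['lan', 'long']].fillna(0)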
# %%
# 8-1) raw4 severe_cases_daily.csv : update total_severe_cases, severe_cases fields for raw
prefs = raw4['prefecture'].unique().tolist()
#print(prefs)
for pref in prefs:
    filter_by = (raw4['prefecture'] == pref)
    dfx = raw4[filter_by]
    dfx.sort_values(by=['date'], inplace=True)
    for index, row in dfx.iterrows():
        filter_by = (raw['file_id'] == 'new_cases') & (raw['prefecture'] == row.prefecture) & (raw['date'] == row.date)
        raw.loc[filter_by, ['total_severe_cases', 'severe_cases']] = [row.total_severe_cases, row.severe_cases]
#print(raw4.tail())
print('8-1) Update total_severe_cases / severe_cases fields completed!')
# %%
# 8-2) raw3 deaths_cumulative_daily.csv : update total_deaths, new_deaths fields for raw
prefs = raw3['prefecture'].unique().tolist()
#print(prefs)
for pref in prefs:
    filter_by = (raw3['prefecture'] == pref)
    dfx = raw3[filter_by]
    dfx.sort_values(by=['date'], inplace=True)
    for index, row in dfx.iterrows():
        filter_by = (raw['file_id'] == 'new_cases') & (raw['prefecture'] == row.prefecture) & (raw['date'] == row.date)
        raw.loc[filter_by, ['total_deaths', 'new_deaths']] = [row.total_deaths, row.new_deaths]
print('8-2) Update total_deaths / new_deaths fields completed!')
csv_file = 'data/csv/covid-19/japan/combined.csv'
raw.to_csv(csv_file, index=False)
print('9) Merging of csv files has been completed!')
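# %%
# 10) Optional check (sketch, not part of the original article): read the merged file back
# and confirm its shape and that no NaN values remain.
check = pd.read_csv(csv_file, parse_dates=['date'])
print(check.shape)
print('NaN count:', check.isnull().sum().sum())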
# %%
Load the COVID-19 CSV files from the MHLW (Ministry of Health, Labour and Welfare) site and clean them up.
Version 2
# Article013_Pandas Merge COVIC19 files Japan by ken(2).py V2
# Data Source: https://www.mhlw.go.jp/stf/covid-19/open-data.html
# csv files:
# newly_confirmed_cases_daily.csv
# requiring_inpatient_care_etc_daily.csv
# deaths_cumulative_daily.csv
# severe_cases_daily.csv
# pcr_tested_daily.csv
# latlng_data_center.csv
# %%
# 0) Merge COVID-19 Data Files
# Import the necessary libraries
import pandas as pd
from pandas.core.frame import DataFrame
import pandas_datareader.data as web # pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
import warnings
warnings.simplefilter('ignore')
# %%
# 1) Load the csv files
csv1 = 'https://covid19.mhlw.go.jp/public/opendata/newly_confirmed_cases_daily.csv'
raw1 = pd.read_csv(csv1)
csv2 = 'https://www.mhlw.go.jp/content/pcr_tested_daily.csv' # Exclude Prefecture (ken)
raw2 = pd.read_csv(csv2)
csv3 = 'https://covid19.mhlw.go.jp/public/opendata/deaths_cumulative_daily.csv'
raw3 = pd.read_csv(csv3)
csv4 = 'https://covid19.mhlw.go.jp/public/opendata/severe_cases_daily.csv'
raw4 = pd.read_csv(csv4)
raw1.to_csv('data/csv/covid-19/japan/newly_confirmed_cases_daily.csv', index=False)
raw2.to_csv('data/csv/covid-19/japan/pcr_tested_daily.csv', index=False)
raw3.to_csv('data/csv/covid-19/japan/deaths_cumulative_daily.csv', index=False)
raw4.to_csv('data/csv/covid-19/japan/severe_cases_daily.csv', index=False)
print('1) Load the csv files completed!')
# %%
# 2) Rename Column names
raw1.set_axis(['date', 'prefecture', 'new_cases'], axis=1, inplace=True)
raw3.set_axis(['date', 'prefecture','total_deaths'], axis=1, inplace=True)
raw4.set_axis(['date', 'prefecture','severe_cases'], axis=1, inplace=True)
# 2-1) Convert str to datetime
raw1['date'] = pd.to_datetime(raw1['date'])
raw3['date'] = pd.to_datetime(raw3['date'])
raw4['date'] = pd.to_datetime(raw4['date'])
# 2-2) Remove ALL rows
filter_by = (raw1['prefecture'] == 'ALL')
#print(raw1[filter_by].index)
raw1.drop(raw1[filter_by].index, inplace=True)
filter_by = (raw3['prefecture'] == 'ALL')
#print(raw3[filter_by].index)
raw3.drop(raw3[filter_by].index, inplace=True)
filter_by = (raw4['prefecture'] == 'ALL')
#print(raw4[filter_by].index)
raw4.drop(raw4[filter_by].index, inplace=True)
print('2) Rename Column names / Convert date / Remove ALL completed!')
# %%
# 3) Add daily or total fields
# 3-1) raw1 newly_confirmed_cases_daily.csv : add a total_cases field
prefs = raw1['prefecture'].unique().tolist()
#print(prefs)
del prefs[0] # Remove 'ALL' from the list
#print(prefs)
#prefs = ['Aichi','Akita']
dfx_prefs = []
for pref in prefs:
    filter_by = (raw1['prefecture'] == pref)
    df = raw1[filter_by]
    df.sort_values(by=['prefecture','date'], inplace=True) # sort by prefecture, date
    dfx = df[['prefecture','date','new_cases']] # select prefecture, date, new_cases
    dfx['total_cases'] = df['new_cases'].cumsum() # calculate total_cases by date
    #csv = f'data/csv/covid-19/japan/newly_confirmed_cases_daily_dfx({pref}).csv'
    #dfx.to_csv(csv, index=False)
    dfx_prefs.append(dfx)
raw1x = pd.concat(dfx_prefs, ignore_index=True, axis=0)
raw1x.to_csv('data/csv/covid-19/japan/newly_confirmed_cases_daily(0).csv', index=False)
#print(raw1.tail())
print('3-1) Add a total_cases field complete!')
# %%
# 3-2) raw4 severe_cases_daily.csv : add a total_severe_cases field
prefs = raw4['prefecture'].unique().tolist()
#print(prefs)
del prefs[0] # Remove 'ALL' from the list
#print(prefs)
#prefs = ['Aichi']
dfx_prefs = []
for pref in prefs:
    filter_by = (raw4['prefecture'] == pref)
    df = raw4[filter_by]
    df.sort_values(by=['date'], inplace=True) # sort by date
    dfx = df[['prefecture','date','severe_cases']] # select prefecture, date, severe_cases
    dfx['total_severe_cases'] = df['severe_cases'].cumsum() # calculate total_severe_cases
    #csv = f'data/csv/covid-19/japan/severe_cases_daily({pref}).csv'
    #dfx.to_csv(csv, index=False)
    dfx_prefs.append(dfx)
raw4x = pd.concat(dfx_prefs, ignore_index=True, axis=0)
raw4x.to_csv('data/csv/covid-19/japan/severe_cases_daily(0).csv', index=False)
#print(raw4.tail())
print('3-2) Add a total_severe_cases field complete!')
# %%
# 3-3) raw3 deaths_cumulative_daily.csv : add a new_deaths field
prefs = raw3['prefecture'].unique().tolist()
#print(prefs)
del prefs[0] # Remove 'ALL' from the list
#print(prefs)
#prefs = ['Aichi']
dfx_prefs = []
for pref in prefs:
    filter_by = (raw3['prefecture'] == pref)
    df = raw3[filter_by]
    df.sort_values(by=['date'], inplace=True)
    dfx = df[['prefecture','date','total_deaths']] # select prefecture, date, total_deaths
    dfx['new_deaths'] = df['total_deaths'].diff() # calculate new_deaths
    # The first date has no previous row to diff against, so use that day's total_deaths
    first_date = dfx['date'].min()
    filter_by = (dfx['date'] == first_date)
    dfy = dfx[filter_by]
    dfx.loc[filter_by, 'new_deaths'] = dfy['total_deaths']
    #csv = f'data/csv/covid-19/japan/deaths_cumulative_daily({pref}).csv'
    #dfx.to_csv(csv, index=False)
    dfx_prefs.append(dfx)
raw3x = pd.concat(dfx_prefs, ignore_index=True, axis=0)
raw3x.to_csv('data/csv/covid-19/japan/deaths_cumulative_daily(0).csv', index=False)
print('3-3) Add a new_deaths field complete!')
# %%
# 4) Merge the CSV files
raw = pd.merge(raw1x, raw3x, how='outer')
raw = pd.merge(raw, raw4x, how='outer')
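# Note (not part of the original article): pd.merge with how='outer' joins on the columns the
# frames have in common, here 'prefecture' and 'date'; passing on=['prefecture', 'date']
# explicitly would state that intent without changing the result.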
csv_file = 'data/csv/covid-19/japan/combined_v2(merge).csv'
raw.to_csv(csv_file, index=False)
print('4) Merge the CSV files completed!')
# %%
# 5) Replace all NaN values with 2019/1/1, N/A, 0
replace_values = {
    'date': '2019/1/1',
    'prefecture': 'N/A',
    'new_cases': 0,
    'severe_cases': 0,
    'new_tests': 0,
    'new_deaths': 0,
    'total_cases': 0,
    'total_severe_cases': 0,
    'total_deaths': 0
}
raw.fillna(value=replace_values, inplace=True)
raw.isnull().sum().sum()
print('5) Replace all NaN values with 2019/1/1, N/A, 0 completed!')
# %%
# 6) Drop rows if the prefecture == 'ALL'
index_no = raw[ raw['prefecture'] == 'ALL' ].index
raw.drop(index_no, inplace=True)
#raw.head()
print('6) Drop rows if the prefecture == ALL completed!')
# %%
# 7) Create a merged csv file
csv_file = 'data/csv/covid-19/japan/combined_v2.csv'
raw.to_csv(csv_file, index=False)
print('7) Merging of csv files has been completed!')
# %%
Analyze the consolidated COVID-19 data.
# Data Source: https://www.mhlw.go.jp/stf/covid-19/open-data.html
# csv files: data/csv/covid-19/japan/combined.csv
# %%
# Grouping & Pivoting in Pandas
# Import the necessary libraries
import pandas as pd
#from pandas.core.frame import DataFrame
#import pandas_datareader.data as web # pip install pandas_datareader
import matplotlib.pyplot as plt
import matplotlib.style as style
import numpy as np
# 1) Load the csv file cases
csv_file = 'data/csv/covid-19/japan/combined.csv'
raw = pd.read_csv(csv_file)
df = raw
# Convert str to datetime
df['date'] = pd.to_datetime(df['date'])
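# Quick sanity check (sketch, not part of the original article): look at the overall shape,
# the file_id values, and the number of prefectures before plotting.
print(df.shape)
print(df['file_id'].unique())
print(df['prefecture'].nunique())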
# %%
# Set the font to support Japanese
plt.rcParams['font.family'] = 'Yu Gothic' # Meiryo, Yu Gothic
# 1) Draw a pie chart for new_cases
# Get to see which prefectures have the most cases currently
desc_prefs = df.groupby('prefecture')['new_cases'].sum().sort_values(ascending=False)
desc_prefs
# Generate labels and values for the pie chart
labels = []
values = []
pref_count = 5 # top 5 prefectures + 1 "Other" slice = 6 wedges
other_total = 0
for pref in desc_prefs.index:
    if pref_count > 0:
        labels.append(pref)
        values.append(desc_prefs[pref])
        pref_count -= 1
    else:
        other_total += desc_prefs[pref]
labels.append("Other")
values.append(other_total)
# Draw a pie chart
wedge_dict = {
    'edgecolor': 'black',
    'linewidth': 2
}
explode = (0, 0, 0, 0.1, 0, 0) # 6 wedges
plt.figure(figsize=(10,8))
plt.pie(
    values,
    labels=labels,
    explode=explode,
    autopct='%1.1f%%',
    textprops={'color':'w'},
    wedgeprops=wedge_dict)
plt.title('COVID-19 Cases\n(発症者数)', color='w', fontsize=18)
#plt.legend(fontsize=15, loc='center right')
plt.legend(
    fontsize=16,
    labels=['東京','大阪','神奈川','埼玉','愛知','その他'],
    loc='center right',
    bbox_to_anchor=(1, 0, 0.5, 1)
)
# %%
# 2) Draw a pie chart for severe_cases
# Get to see which prefectures have the most severe cases currently
desc_prefs = df.groupby('prefecture')['severe_cases'].sum().sort_values(ascending=False)
desc_prefs
# Generate labels and values for the pie chart
labels = []
values = []
pref_count = 5 # top 5 prefectures + 1 "Other" slice = 6 wedges
other_total = 0
for pref in desc_prefs.index:
    if pref_count > 0:
        labels.append(pref)
        values.append(desc_prefs[pref])
        pref_count -= 1
    else:
        other_total += desc_prefs[pref]
labels.append("Other")
values.append(other_total)
# Draw a pie chart
wedge_dict = {
    'edgecolor': 'black',
    'linewidth': 2
}
explode = (0, 0, 0, 0, 0.1, 0) # 6 wedges
plt.figure(figsize=(10,8))
plt.pie(
    values,
    labels=labels,
    explode=explode,
    autopct='%1.1f%%',
    textprops={'color':'w'},
    wedgeprops=wedge_dict)
plt.title('COVID-19 Severe Cases\n(重症者数)', color='w', fontsize=18)
#plt.legend(fontsize=15, loc='center right')
plt.legend(
    fontsize=16,
    labels=['大阪','東京','神奈川','兵庫','埼玉','その他'],
    loc='center right',
    bbox_to_anchor=(1, 0, 0.5, 1)
)
# %%
# 3) Draw a pie chart for total_deaths
# Get last date to see which prefectures have the most deaths currently
last_date = df['date'].max()
df_last_date = df[ df['date'] == last_date ]
desc_prefs = df_last_date.groupby('prefecture')['total_deaths'].sum().sort_values(ascending=False)
desc_prefs
# Generate labels and values for the pie chart
labels = []
values = []
pref_count = 7 # top 7 prefectures + 1 "Other" slice = 8 wedges
other_total = 0
for pref in desc_prefs.index:
    if pref_count > 0:
        labels.append(pref)
        values.append(desc_prefs[pref])
        pref_count -= 1
    else:
        other_total += desc_prefs[pref]
labels.append("Other")
values.append(other_total)
# Draw a pie chart
wedge_dict = {
    'edgecolor': 'black',
    'linewidth': 2
}
explode = (0, 0, 0, 0, 0, 0, 0.1, 0) # 8 wedges
plt.figure(figsize=(10,8))
plt.pie(
    values,
    labels=labels,
    explode=explode,
    autopct='%1.1f%%',
    textprops={'color':'w'},
    wedgeprops=wedge_dict)
plt.title('COVID-19 Total Deaths\n(累計死亡者数)', color='w', fontsize=18)
#plt.legend(fontsize=15, loc='center right')
plt.legend(
    fontsize=16,
    labels=['大阪','東京','北海道','兵庫','神奈川','愛知','埼玉','その他'],
    loc='center right',
    bbox_to_anchor=(1, 0, 0.5, 1)
)
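# Note (sketch, not part of the original article): when this file is run as a plain Python
# script instead of interactively cell by cell, the figures are not displayed automatically.
# An explicit plt.show() is then needed; placed here it displays all three charts at once.
# The savefig file name below is only an example.
#plt.savefig('covid19_total_deaths_pie.png') # optional: save the last figure (call before plt.show())
plt.show()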