# NOTE(review): web-page navigation residue ("Newer" / "Older") removed —
# these bare names would raise NameError at import time.
"""Data Acquisition for DWD global monthly Air Temperature
Author: Peter Morstein
"""
import pandas as pd
from ftplib import FTP
import numpy as np
#import ExportToWorldShape as exportToWorldShape
import ExportToDatabase as exportToDatabase
"""
example files:
dwd gauge monthly mean: https://opendata.dwd.de/climate_environment/CDC/observations_global/CLIMAT/monthly/qc/air_temperature_mean/historical/10961_195301_201812.txt
"""
# DWD CLIMAT endpoints: the station catalogue (HTTP) and the FTP directory
# holding the historical monthly mean air-temperature files.
stationURL = "https://opendata.dwd.de/climate_environment/CDC/help/stations_list_CLIMAT_data.txt"
dwdFtpServer = "opendata.dwd.de"
dwdFtpUri = "/climate_environment/CDC/observations_global/CLIMAT/monthly/qc/air_temperature_mean/historical/"

countryAnnualTemp = pd.DataFrame([])

# Load all available stations from the DWD service.
# Columns 0, 2, 3, 5 of the semicolon-separated catalogue are id, lat, lon, country.
# fixed: encoding name had a trailing space ("ISO-8859-1 ") — it only worked
# because codec-name normalization strips punctuation; use the clean name.
stationList = pd.read_csv(stationURL, delimiter=";", skiprows=0, usecols=[0, 2, 3, 5],
                          names=["id", "lat", "lon", "country"], header=0,
                          encoding="ISO-8859-1")
stationList = stationList.dropna(how="any", axis=0)
# the raw catalogue columns carry padding whitespace
for _col in ("country", "lon", "lat"):
    stationList[_col] = stationList[_col].str.strip()
def filterDWDGauges():
    """Restrict the global ``stationList`` to German gauges north of 40°N.

    Mutates the module-level ``stationList`` in place: keeps only rows whose
    country is "Germany", converts ``lat`` to a (down-cast) float column,
    and drops stations at or below latitude 40.
    """
    global stationList
    is_german = stationList['country'] == "Germany"
    stationList = stationList.loc[is_german]
    # non-numeric latitude strings become NaN and fall out of the > 40 filter
    stationList['lat'] = pd.to_numeric(stationList['lat'], errors='coerce', downcast='float')
    stationList = stationList.loc[stationList['lat'] > 40]
# load station file names from DWD an join the filename with the stationList
def loadAndJoinDWDClimateFilenames():
    """List the CLIMAT data files on the DWD FTP server and join each station
    with the filename of its longest temperature series.

    Mutates the global ``stationList``: afterwards it is indexed by station id,
    carries a ``file`` column, and stations without a data file or with an
    empty country are dropped.
    """
    global stationList
    print("load dwd climate filenames")

    # list all climate files available on the DWD FTP server
    dwdFTP = FTP(dwdFtpServer)
    dwdFTP.login()
    dwdFTP.cwd(dwdFtpUri)

    ftpIds = []
    ftpFileNames = []
    # hoisted: build the membership set once instead of tolist() per file
    knownIds = set(stationList["id"].tolist())
    # NOTE(review): the original directory-listing loop header was lost in
    # extraction (``file_name`` was undefined); FTP.nlst() yields the plain
    # file names, e.g. "10961_195301_201812.txt" (id_start_end).
    for file_name in dwdFTP.nlst():
        gaugeID = file_name.split("_")[0]
        if gaugeID in knownIds:
            ftpFileNames.append(file_name)
            ftpIds.append(gaugeID)
    fileList = pd.DataFrame({'id': ftpIds, "file": ftpFileNames})
    dwdFTP.quit()

    # filter climate files list by longest timeseries
    # (because: there are multiple timeseries-files per station with same historical values)
    # fixed: DataFrame.append was removed in pandas 2.0 — take the last file
    # per id directly (same "last row of each group" semantics as before).
    longestSeries = fileList.groupby("id").tail(1)

    # concat climate files with station list
    stationList = stationList.set_index("id").join(longestSeries.set_index("id"), on="id")
    stationList = stationList.dropna(axis=0, how="any")
    stationList = stationList[stationList.country != ""]
    # with open("stationList.pickle","wb") as pf:
    #     pickle.dump(stationList, pf)
# here we have to try some interpolations for missing values
# NOTE(review): lines 90-136 of the original file were lost during extraction;
# only their bare line numbers remained here and have been removed.
def fillMissingData(annualData):
    """Interpolate missing monthly temperatures and (re)compute the annual mean.

    A NaN month is replaced by the rounded mean of its temporal neighbours:
    the same month of the previous/next year plus the adjacent months,
    crossing year boundaries for January/December. Afterwards a ``mean``
    column is written with the rounded average over all twelve months.

    annualData: DataFrame whose first twelve columns are the months listed
        below (German abbreviations), one row per year. Mutated in place.
    Returns the mutated DataFrame.
    """
    months = ["Jan", "Feb", "Mrz", "Apr", "Mai", "Jun",
              "Jul", "Aug", "Sep", "Okt", "Nov", "Dez"]
    for y in range(len(annualData)):
        for m in range(len(months)):
            if not np.isnan(annualData.iloc[y].loc[months[m]]):
                continue
            prevYear = y - 1 if y >= 1 else None
            nextYear = y + 1 if y < len(annualData) - 1 else None
            prevMonth = m - 1
            nextMonth = m + 1
            averageList = []
            if prevYear is not None:
                averageList.append(annualData.iloc[prevYear].loc[months[m]])
            if nextYear is not None:
                averageList.append(annualData.iloc[nextYear].loc[months[m]])
            if prevMonth >= 0:
                averageList.append(annualData.iloc[y].loc[months[prevMonth]])
            elif prevYear is not None:
                # January: the previous month is December of the previous year
                averageList.append(annualData.iloc[prevYear].loc[months[-1]])
            if nextMonth < len(months):
                averageList.append(annualData.iloc[y].loc[months[nextMonth]])
            elif nextYear is not None:
                # December: the next month is January of the next year
                averageList.append(annualData.iloc[nextYear].loc[months[0]])
            annualData.iat[y, m] = np.round(np.nanmean(averageList), 2)
    # fixed off-by-one: iloc[:, 0:11] silently dropped December ("Dez")
    # from the annual mean; 0:12 covers all twelve month columns.
    annualData["mean"] = np.round(annualData.iloc[:, 0:12].mean(axis=1, skipna=True), 2)
    return annualData
def loadTemperatureFromDWDGauges():
    """Download every gauge's monthly temperature file and attach the yearly
    mean temperatures to the global ``stationList`` (one column per year).

    Iterates the stations grouped by country, reads each station's
    semicolon-separated CLIMAT file over HTTPS, computes the row mean per
    year and writes it into ``stationList`` at (station id, year). Prints a
    rough progress indicator per country. Finally the year columns are cast
    to strings and sorted newest-first.
    """
    global climateCountry
    global stationList
    global annualData
    global worldTemperature

    for index, gaugeCountry in stationList.groupby("country"):
        gaugeCount = len(gaugeCountry.country)
        print(index, ": ", gaugeCount, " gauges to load")
        gaugeURLs = "https://" + dwdFtpServer + dwdFtpUri + gaugeCountry.file
        gaugeIds = gaugeCountry.index
        # fixed: progress step of round(count/10) is 0 for small countries,
        # which made the modulo below divide by zero
        step = max(1, round(gaugeCount / 10))
        # fixed: ``i`` was used below but never defined/incremented
        i = 0
        for gid, gurl in zip(gaugeIds, gaugeURLs):
            annualData = pd.read_csv(gurl, delimiter=";")
            annualData = annualData.set_index("Jahr")
            annualData["mean"] = annualData.mean(axis=1)
            #annualData = fillMissingData(annualData)
            for dataIndex, annualMean in annualData.iterrows():
                try:
                    stationList.at[gid, dataIndex] = annualMean["mean"]
                except Exception:
                    # best-effort: skip years that cannot be written for this
                    # station (was a bare except; keep KeyboardInterrupt alive)
                    continue
            i += 1
            if i % step == 0:
                finished = i / gaugeCount * 100
                print(np.round(finished), end="% ... ")
            if i == gaugeCount:
                print('', end=' 100%, Done. \n')

    # year columns as strings, newest first
    stationList.columns = stationList.columns.astype(str)
    stationList = stationList.sort_index(axis=1, ascending=False)
# Pipeline entry point: resolve the data file for every station, then
# download and attach the per-year mean temperatures.
# NOTE(review): filterDWDGauges() is defined above but never invoked here —
# confirm whether the unfiltered (global) station list is intended.
loadAndJoinDWDClimateFilenames()
loadTemperatureFromDWDGauges()
# Optional persistence of the intermediate result (disabled):
# with open("./pickle/stationList_germany.pickle", "wb") as pickleFile:
#     pickle.dump(stationList, pickleFile)
#     stationList = pickle.load(pickleFile)
# stationList = pd.read_pickle('./pickle/stationList_germany.pickle')
# export station list to different outputs
#exportToWorldShape.export(stationList)