Post-processing of Global hourly weather data from Integrated Surface Data (ISD)
High-resolution (fine temporal scale) data are important for observing discernible change and retrieving meaningful information in climate analysis. The finer the temporal resolution, the better the result and the more biases can be removed.
This post seeks to shed light on how to download, clean, and reformat hourly weather data from the Integrated Surface Data (ISD).
This hourly data is provided by the US National Centers for Environmental Information (NCEI). This repository contains at least 6 mandatory variables on an hourly basis from various weather stations around the world. Namely:
1. WIND-OBSERVATION (WND).
2. SKY-CONDITION-OBSERVATION (CIG).
3. VISIBILITY-OBSERVATION (VIS).
4. AIR-TEMPERATURE-OBSERVATION (TMP).
5. AIR-TEMPERATURE-OBSERVATION dew point (DEW).
6. AIR-PRESSURE-OBSERVATION sea level pressure (SLP).
The hourly data are accessible from the following link: https://www.ncei.noaa.gov/data/global-hourly/access/
Load packages
The following packages are required.
Filter weather stations information in a particular country
ctry_name='BENIN' # specify country name
stations=unique(inventory[which(inventory$COUNTRY_NAME==ctry_name),1:4])
stations
Quick view on raw file from ISD
I will consider the station “65306-099999”, 2016 data.
example_link="https://www.ncei.noaa.gov/data/global-hourly/access/2016/65306099999.csv"
curl_download(example_link,"65306099999_2016.csv")
data.table(read.csv('65306099999_2016.csv')[,c(1:2,11:16)])
As you can see in the display above, the way the data are stored does not allow them to be used directly. Some post-processing is therefore required to convert the data into a usable format for further analysis.
This post aims at providing functions dedicated to reformatting the data.
# Function for processing mandatory variables fron NCEI database
pros_mandatory_var=function(stat_dat){
#convert colunm Date from character to Datetime format
stat_dat$DATE<-ymd_hms(stat_dat$DATE)
# subtract Mandatory Data and quality code
spl=data.table(
as.numeric(substr(stat_dat$TMP,1,5)),substr(stat_dat$TMP,7,9),#Temperature (TEMP)
as.numeric(substr(stat_dat$DEW,1,5)),substr(stat_dat$DEW,7,7),#AIR-TEMPERATURE-OBSERVATION dew point temperature
as.numeric(substr(stat_dat$WND,1,3)),substr(stat_dat$WND,5,5),substr(stat_dat$WND,7,7),#Wind direction (WD) and Wind type
as.numeric(substr(stat_dat$WND,9,12)),substr(stat_dat$WND,14,14),#Wind speed rate (WSR)
as.numeric(substr(stat_dat$CIG,1,5)),substr(stat_dat$CIG,7,7),#SKY-CONDITION-OBSERVATION ceiling height
substr(stat_dat$CIG,9,9),substr(stat_dat$CIG,11,11),#SKY-CONDITION-OBSERVATION ceiling determination code
as.numeric(substr(stat_dat$VIS,1,6)),substr(stat_dat$VIS,8,8),#VISIBILITY-OBSERVATION distance dimension
substr(stat_dat$VIS,10,10),substr(stat_dat$VIS,12,12),#VISIBILITY-OBSERVATION variability code
as.numeric(substr(stat_dat$SLP,1,5)),substr(stat_dat$SLP,7,7)#ATMOSPHERIC-PRESSURE-OBSERVATION sea level pressure
)
#Rename column names
colnames(spl)<-c('TEMP','QC_TEMP',
'DEW','QC_DEW',
'WND','QC_WND','WND_TYPE',
'WSR','QC_WSR',
'CIG_H','QC_CIG_H',
'CIG_D','QC_CIG_D',
'VIS_D','QC_VIS_D',
'VIS_V','QC_VIS_V',
'SLP','QC_SLP')
#Removing the scaling factor of 10 from values
spl$TEMP<-spl$TEMP/10
spl$WSR<-spl$WSR/10
spl$DEW<-spl$DEW/10
spl$SLP<-spl$SLP/10
# Quality control and attribute NA to Suspect or Erroneous values from QC colunm
spl$TEMP[!(which(spl$QC_TEMP==1) || which(spl$QC_TEMP==5)||
which(spl$QC_TEMP=='A')|| which(spl$QC_TEMP=='C') || which(spl$QC_TEMP=='I') ||
which(spl$QC_TEMP=='M')|| which(spl$QC_TEMP=='P') || which(spl$QC_TEMP=='R') ||
which(spl$QC_TEMP=='U'))]<-NA;
spl$DEW[!(which(spl$QC_DEW==1) || which(spl$QC_DEW==5)||
which(spl$QC_DEW=='A')|| which(spl$QC_DEW=='C') || which(spl$QC_DEW=='I') ||
which(spl$QC_DEW=='M')|| which(spl$QC_DEW=='P') || which(spl$QC_DEW=='R') ||
which(spl$QC_DEW=='U'))]<-NA;
spl$WND[!(which(spl$QC_WND==1) || which(spl$QC_WND==5))]<-NA;
spl$WSR[!(which(spl$QC_WSR==1) || which(spl$QC_WSR==5))]<-NA;
spl$CIG_H[!(which(spl$QC_CIG_H==1) || which(spl$QC_CIG_H==5))]<-NA;
spl$CIG_D[!(which(spl$QC_CIG_D=='N') || which(spl$QC_CIG_D=='Y'))]<-NA;
spl$VIS_D[!(which(spl$QC_VIS_D==1) || which(spl$QC_VIS_D==5))]<-NA;
spl$VIS_V[!(which(spl$QC_VIS_V=='N') || which(spl$QC_VIS_V=='Y'))]<-NA;
spl$SLP[!(which(spl$QC_SLP==1) || which(spl$QC_SLP==5))]<-NA;
# Output data
attach(spl)
OutDt=cbind(stat_dat[,c("NAME","DATE","LATITUDE","LONGITUDE")],
TEMP,DEW,WND,WND_TYPE,WSR,CIG_H,CIG_D,VIS_D,VIS_V,SLP)
detach(spl)
return(OutDt)
}
# Function for downloading, cleaning, reformatting of global-hourly weather data based on stations and time-range
get_GHWD <-function(stationName,time_range){
#Reformat stations name to correct form
stationName=str_remove(stationName,'-')
pathbase<-"https://www.ncei.noaa.gov/data/global-hourly/access/" #default access link
export_file=NULL
for (i in 1:length(stationName)){
for (j in 1:length(time_range)){
datapath=paste(pathbase,time_range[j],'/',stationName[i],'.csv',sep='')
saved_name=paste(stationName[i],'_',time_range[j],'.csv',sep='')
req <- curl_fetch_memory(datapath)
if(req$status_code==404){
print(paste('There is no data available for the station:',
stationName[i],'for the year ',time_range[j]))
next
}
curl_download(datapath,saved_name)
dt <- read.csv(saved_name)
outfl<-pros_mandatory_var(dt)
export_file=rbind(export_file,outfl)
unlink(saved_name, recursive = TRUE) # delete file
}
}
return(export_file)
if(is.null(export_file)){print('There is no data for the entry stations names and time ranges')}
}
Post-processing of ISD data
For this exercise we will use the stations above, for a particular time range.
stationName=stations$STNID[1:6] #select 6 stations
time_range=2018
#Download mandatory hourly variables for stations in Benin for 2018
dt=get_GHWD(stationName,time_range) #it may take sometimes depending of internet connection.
data.table(dt)
Monthly trend of min temperature per station
# Generate minimium and maximal daily temperature from hourly data
dt$DATE<-ymd_hms(dt$DATE)
dt$Day=as.Date(dt$DATE)
dt$MON=months(dt$Day)
daily_temp=dt %>% group_by(NAME,Day,MON)%>%
summarise(Tmin=min(TEMP,na.rm = TRUE),Tmax=max(TEMP,na.rm = TRUE))%>%
arrange(match(MON,month.name))
daily_temp
#Monthly mean
monmean_temp=daily_temp %>% group_by(NAME,MON)%>%
summarise(Tmin=mean(Tmin,na.rm = TRUE),Tmax=mean(Tmax,na.rm = TRUE))%>%
arrange(match(MON,month.name))
monmean_temp
Visualisation
# Set font
windowsFonts(B=windowsFont("Bookman Old Style"))
par(family = "B",par(mfrow=c(2,2)))
for(i in 1:4){
temp=daily_temp[which(daily_temp$NAME==unique(daily_temp$NAME)[i]),]
monmean=temp %>% group_by(NAME,MON)%>%
summarise(Tmin=mean(Tmin,na.rm = TRUE),Tmax=mean(Tmax,na.rm = TRUE))%>%
arrange(match(MON,month.name))
boxplot(temp$Tmin ~ month(temp$Day),
ylab="Deg C" , xlab="Months",
main=paste("Tmin - ",unique(temp$NAME)),
axes=FALSE,col='#E37831')
axis(1,at=c(1:12),labels = month.abb,las=3.5);axis(2,las=2);box()
lines(1:12,monmean$Tmin,col="#ff7f00",lwd=3)
legend(x="bottomright",legend="Monthly Mean",
cex = 0.6,lwd = 3, col = "#ff7f00",lty = 1,bty='n')
}
Thierry, thank you very much for this wonderful work. The graphs looks very interesting and informative. I will try to workout on my country and see if I can make any sense out of the available data.
ReplyDeleteGreat Mr Charles.
DeleteThis comment has been removed by the author.
ReplyDeleteVery nice brother
ReplyDelete