Post-processing of Global hourly weather data from Integrated Surface Data (ISD)
High-resolution (fine temporal scale) data are important for observing discernible change and retrieving meaningful information in climate analysis. The finer the temporal resolution, the better the result and the more biases can be removed.
This post seeks to shed light on how to download, clean, and reformat hourly weather data from the Integrated Surface Data (ISD).
This hourly data is provided by the US National Centers for Environmental Information (NCEI). This repository contains at least 6 mandatory variables on an hourly basis from various weather stations around the world. Namely:
1. WIND-OBSERVATION (WND).
2. SKY-CONDITION-OBSERVATION (CIG).
3. VISIBILITY-OBSERVATION (VIS).
4. AIR-TEMPERATURE-OBSERVATION (TMP).
5. AIR-TEMPERATURE-OBSERVATION dew point (DEW).
6. AIR-PRESSURE-OBSERVATION sea level pressure (SLP).
The hourly data are accessible from the following link: https://www.ncei.noaa.gov/data/global-hourly/access/
Load packages
The following packages are required.
Filter weather stations information in a particular country
ctry_name='BENIN' # specify country name
stations=unique(inventory[which(inventory$COUNTRY_NAME==ctry_name),1:4])
stations
Quick view on raw file from ISD
I will consider the station “65306-099999”, 2016 data.
example_link="https://www.ncei.noaa.gov/data/global-hourly/access/2016/65306099999.csv"
curl_download(example_link,"65306099999_2016.csv")
data.table(read.csv('65306099999_2016.csv')[,c(1:2,11:16)])
As you can see in the display above, the way the data are stored does not allow them to be used directly. Some post-processing is therefore required to convert the data into a usable format for further analysis.
This post aims at providing functions dedicated to reformatting the data.
# Function for processing mandatory variables fron NCEI database
pros_mandatory_var=function(stat_dat){
#convert colunm Date from character to Datetime format
stat_dat$DATE<-ymd_hms(stat_dat$DATE)
# subtract Mandatory Data and quality code
spl=data.table(
as.numeric(substr(stat_dat$TMP,1,5)),substr(stat_dat$TMP,7,9),#Temperature (TEMP)
as.numeric(substr(stat_dat$DEW,1,5)),substr(stat_dat$DEW,7,7),#AIR-TEMPERATURE-OBSERVATION dew point temperature
as.numeric(substr(stat_dat$WND,1,3)),substr(stat_dat$WND,5,5),substr(stat_dat$WND,7,7),#Wind direction (WD) and Wind type
as.numeric(substr(stat_dat$WND,9,12)),substr(stat_dat$WND,14,14),#Wind speed rate (WSR)
as.numeric(substr(stat_dat$CIG,1,5)),substr(stat_dat$CIG,7,7),#SKY-CONDITION-OBSERVATION ceiling height
substr(stat_dat$CIG,9,9),substr(stat_dat$CIG,11,11),#SKY-CONDITION-OBSERVATION ceiling determination code
as.numeric(substr(stat_dat$VIS,1,6)),substr(stat_dat$VIS,8,8),#VISIBILITY-OBSERVATION distance dimension
substr(stat_dat$VIS,10,10),substr(stat_dat$VIS,12,12),#VISIBILITY-OBSERVATION variability code
as.numeric(substr(stat_dat$SLP,1,5)),substr(stat_dat$SLP,7,7)#ATMOSPHERIC-PRESSURE-OBSERVATION sea level pressure
)
#Rename column names
colnames(spl)<-c('TEMP','QC_TEMP',
'DEW','QC_DEW',
'WND','QC_WND','WND_TYPE',
'WSR','QC_WSR',
'CIG_H','QC_CIG_H',
'CIG_D','QC_CIG_D',
'VIS_D','QC_VIS_D',
'VIS_V','QC_VIS_V',
'SLP','QC_SLP')
#Removing the scaling factor of 10 from values
spl$TEMP<-spl$TEMP/10
spl$WSR<-spl$WSR/10
spl$DEW<-spl$DEW/10
spl$SLP<-spl$SLP/10
# Quality control and attribute NA to Suspect or Erroneous values from QC colunm
spl$TEMP[!(which(spl$QC_TEMP==1) || which(spl$QC_TEMP==5)||
which(spl$QC_TEMP=='A')|| which(spl$QC_TEMP=='C') || which(spl$QC_TEMP=='I') ||
which(spl$QC_TEMP=='M')|| which(spl$QC_TEMP=='P') || which(spl$QC_TEMP=='R') ||
which(spl$QC_TEMP=='U'))]<-NA;
spl$DEW[!(which(spl$QC_DEW==1) || which(spl$QC_DEW==5)||
which(spl$QC_DEW=='A')|| which(spl$QC_DEW=='C') || which(spl$QC_DEW=='I') ||
which(spl$QC_DEW=='M')|| which(spl$QC_DEW=='P') || which(spl$QC_DEW=='R') ||
which(spl$QC_DEW=='U'))]<-NA;
spl$WND[!(which(spl$QC_WND==1) || which(spl$QC_WND==5))]<-NA;
spl$WSR[!(which(spl$QC_WSR==1) || which(spl$QC_WSR==5))]<-NA;
spl$CIG_H[!(which(spl$QC_CIG_H==1) || which(spl$QC_CIG_H==5))]<-NA;
spl$CIG_D[!(which(spl$QC_CIG_D=='N') || which(spl$QC_CIG_D=='Y'))]<-NA;
spl$VIS_D[!(which(spl$QC_VIS_D==1) || which(spl$QC_VIS_D==5))]<-NA;
spl$VIS_V[!(which(spl$QC_VIS_V=='N') || which(spl$QC_VIS_V=='Y'))]<-NA;
spl$SLP[!(which(spl$QC_SLP==1) || which(spl$QC_SLP==5))]<-NA;
# Output data
attach(spl)
OutDt=cbind(stat_dat[,c("NAME","DATE","LATITUDE","LONGITUDE")],
TEMP,DEW,WND,WND_TYPE,WSR,CIG_H,CIG_D,VIS_D,VIS_V,SLP)
detach(spl)
return(OutDt)
}
# Function for downloading, cleaning, reformatting of global-hourly weather data based on stations and time-range
get_GHWD <-function(stationName,time_range){
#Reformat stations name to correct form
stationName=str_remove(stationName,'-')
pathbase<-"https://www.ncei.noaa.gov/data/global-hourly/access/" #default access link
export_file=NULL
for (i in 1:length(stationName)){
for (j in 1:length(time_range)){
datapath=paste(pathbase,time_range[j],'/',stationName[i],'.csv',sep='')
saved_name=paste(stationName[i],'_',time_range[j],'.csv',sep='')
req <- curl_fetch_memory(datapath)
if(req$status_code==404){
print(paste('There is no data available for the station:',
stationName[i],'for the year ',time_range[j]))
next
}
curl_download(datapath,saved_name)
dt <- read.csv(saved_name)
outfl<-pros_mandatory_var(dt)
export_file=rbind(export_file,outfl)
unlink(saved_name, recursive = TRUE) # delete file
}
}
return(export_file)
if(is.null(export_file)){print('There is no data for the entry stations names and time ranges')}
}
Post-processing of ISD data
For this exercise we will use the stations above, for a particular time range.
stationName=stations$STNID[1:6] #select 6 stations
time_range=2018
#Download mandatory hourly variables for stations in Benin for 2018
dt=get_GHWD(stationName,time_range) #it may take sometimes depending of internet connection.
data.table(dt)
Monthly trend of min temperature per station
# Generate minimium and maximal daily temperature from hourly data
dt$DATE<-ymd_hms(dt$DATE)
dt$Day=as.Date(dt$DATE)
dt$MON=months(dt$Day)
daily_temp=dt %>% group_by(NAME,Day,MON)%>%
summarise(Tmin=min(TEMP,na.rm = TRUE),Tmax=max(TEMP,na.rm = TRUE))%>%
arrange(match(MON,month.name))
daily_temp
#Monthly mean
monmean_temp=daily_temp %>% group_by(NAME,MON)%>%
summarise(Tmin=mean(Tmin,na.rm = TRUE),Tmax=mean(Tmax,na.rm = TRUE))%>%
arrange(match(MON,month.name))
monmean_temp
Visualisation
# Set font
windowsFonts(B=windowsFont("Bookman Old Style"))
par(family = "B",par(mfrow=c(2,2)))
for(i in 1:4){
temp=daily_temp[which(daily_temp$NAME==unique(daily_temp$NAME)[i]),]
monmean=temp %>% group_by(NAME,MON)%>%
summarise(Tmin=mean(Tmin,na.rm = TRUE),Tmax=mean(Tmax,na.rm = TRUE))%>%
arrange(match(MON,month.name))
boxplot(temp$Tmin ~ month(temp$Day),
ylab="Deg C" , xlab="Months",
main=paste("Tmin - ",unique(temp$NAME)),
axes=FALSE,col='#E37831')
axis(1,at=c(1:12),labels = month.abb,las=3.5);axis(2,las=2);box()
lines(1:12,monmean$Tmin,col="#ff7f00",lwd=3)
legend(x="bottomright",legend="Monthly Mean",
cex = 0.6,lwd = 3, col = "#ff7f00",lty = 1,bty='n')
}
Thierry, thank you very much for this wonderful work. The graphs looks very interesting and informative. I will try to workout on my country and see if I can make any sense out of the available data.
ReplyDeleteGreat Mr Charles.
DeleteThis comment has been removed by the author.
ReplyDeleteVery nice brother
ReplyDelete