Integrated Surface Data (ISD)

Post-processing of Global hourly weather data from Integrated Surface Data (ISD)

High-resolution data (temporal scale) are important to observe discernable change and to retrieve meaningful information in climate analysis. The finest the temporal resolution, better is the result and more biases removed.

This post seeks to shed light on how to download, cleaning and reformatting hourly weather data from the Integrated Surface Data (ISD).

This hourly data is provided by the US National Centers for Environmental Information (NCEI). This repository contains at least 6 mandatory variables on an hourly basis from various weather stations around the world. Namely:

1. WIND-OBSERVATION (WND).

2. SKY-CONDITION-OBSERVATION (CIG).

3. VISIBILITY-OBSERVATION (VIS).

4. AIR-TEMPERATURE-OBSERVATION (TMP).

5. AIR-TEMPERATURE-OBSERVATION dew point (DEW).

6. AIR-PRESSURE-OBSERVATION sea level pressure (SLP).

The hourly data are accessible from the following link: https://www.ncei.noaa.gov/data/global-hourly/access/

Download Weather Station Database Inventories

Filter weather stations information in a particular country

Quick view on raw file from ISD

I will consider the station “65306-099999”, 2016 data.

As you can see, in the display above, the saving manner of the data and the protocol of storage does not help in using it directly. Therefore some post processing on the data is required. In order to be converted in the normal format for further analysis.

This post aims at providing a constructed functions dedicated to reformat the data.

# Function for processing mandatory variables  fron NCEI database
pros_mandatory_var=function(stat_dat){
  
  #convert colunm Date from character to Datetime format
  stat_dat$DATE<-ymd_hms(stat_dat$DATE)
  
  # subtract Mandatory Data and quality code
  spl=data.table(
    as.numeric(substr(stat_dat$TMP,1,5)),substr(stat_dat$TMP,7,9),#Temperature (TEMP)
    as.numeric(substr(stat_dat$DEW,1,5)),substr(stat_dat$DEW,7,7),#AIR-TEMPERATURE-OBSERVATION dew point temperature
    as.numeric(substr(stat_dat$WND,1,3)),substr(stat_dat$WND,5,5),substr(stat_dat$WND,7,7),#Wind direction (WD) and Wind type
    as.numeric(substr(stat_dat$WND,9,12)),substr(stat_dat$WND,14,14),#Wind speed rate (WSR)
    as.numeric(substr(stat_dat$CIG,1,5)),substr(stat_dat$CIG,7,7),#SKY-CONDITION-OBSERVATION ceiling height
    substr(stat_dat$CIG,9,9),substr(stat_dat$CIG,11,11),#SKY-CONDITION-OBSERVATION ceiling determination code
    as.numeric(substr(stat_dat$VIS,1,6)),substr(stat_dat$VIS,8,8),#VISIBILITY-OBSERVATION distance dimension
    substr(stat_dat$VIS,10,10),substr(stat_dat$VIS,12,12),#VISIBILITY-OBSERVATION variability code
    as.numeric(substr(stat_dat$SLP,1,5)),substr(stat_dat$SLP,7,7)#ATMOSPHERIC-PRESSURE-OBSERVATION sea level pressure
  )
  
  
  #Rename column names
  colnames(spl)<-c('TEMP','QC_TEMP',
                   'DEW','QC_DEW',
                   'WND','QC_WND','WND_TYPE',
                   'WSR','QC_WSR',
                   'CIG_H','QC_CIG_H',
                   'CIG_D','QC_CIG_D',
                   'VIS_D','QC_VIS_D',
                   'VIS_V','QC_VIS_V',
                   'SLP','QC_SLP')
  
  #Removing the scaling factor of 10 from values
  spl$TEMP<-spl$TEMP/10
  spl$WSR<-spl$WSR/10
  spl$DEW<-spl$DEW/10
  spl$SLP<-spl$SLP/10
  
  
  # Quality control and attribute NA to Suspect or Erroneous values from QC colunm 
  spl$TEMP[!(which(spl$QC_TEMP==1) || which(spl$QC_TEMP==5)|| 
               which(spl$QC_TEMP=='A')|| which(spl$QC_TEMP=='C') || which(spl$QC_TEMP=='I') ||
               which(spl$QC_TEMP=='M')|| which(spl$QC_TEMP=='P') || which(spl$QC_TEMP=='R') ||
               which(spl$QC_TEMP=='U'))]<-NA;
  
  spl$DEW[!(which(spl$QC_DEW==1) || which(spl$QC_DEW==5)|| 
              which(spl$QC_DEW=='A')|| which(spl$QC_DEW=='C') || which(spl$QC_DEW=='I') ||
              which(spl$QC_DEW=='M')|| which(spl$QC_DEW=='P') || which(spl$QC_DEW=='R') ||
              which(spl$QC_DEW=='U'))]<-NA;
  
  spl$WND[!(which(spl$QC_WND==1) || which(spl$QC_WND==5))]<-NA;
  spl$WSR[!(which(spl$QC_WSR==1) || which(spl$QC_WSR==5))]<-NA;
  spl$CIG_H[!(which(spl$QC_CIG_H==1) || which(spl$QC_CIG_H==5))]<-NA;
  spl$CIG_D[!(which(spl$QC_CIG_D=='N') || which(spl$QC_CIG_D=='Y'))]<-NA;
  spl$VIS_D[!(which(spl$QC_VIS_D==1) || which(spl$QC_VIS_D==5))]<-NA;
  spl$VIS_V[!(which(spl$QC_VIS_V=='N') || which(spl$QC_VIS_V=='Y'))]<-NA;
  spl$SLP[!(which(spl$QC_SLP==1) || which(spl$QC_SLP==5))]<-NA;
  
  
  # Output data
  attach(spl)
  OutDt=cbind(stat_dat[,c("NAME","DATE","LATITUDE","LONGITUDE")],
              TEMP,DEW,WND,WND_TYPE,WSR,CIG_H,CIG_D,VIS_D,VIS_V,SLP)
  detach(spl)
  
  return(OutDt)
  
}

# Function for downloading, cleaning, reformatting of global-hourly weather data based on stations and time-range
get_GHWD <-function(stationName,time_range){
  #Reformat stations name to correct form
  stationName=str_remove(stationName,'-')
  
  pathbase<-"https://www.ncei.noaa.gov/data/global-hourly/access/" #default access link
  export_file=NULL
  
  for (i in 1:length(stationName)){
    
    for (j in 1:length(time_range)){
      
      datapath=paste(pathbase,time_range[j],'/',stationName[i],'.csv',sep='')
      saved_name=paste(stationName[i],'_',time_range[j],'.csv',sep='')
      req <- curl_fetch_memory(datapath)
      
      if(req$status_code==404){
        print(paste('There is no data available for the station:',
                    stationName[i],'for the year ',time_range[j]))
        next
      }
      
      curl_download(datapath,saved_name)
      dt <- read.csv(saved_name)
      outfl<-pros_mandatory_var(dt)
      export_file=rbind(export_file,outfl)
      unlink(saved_name, recursive = TRUE) # delete file
      
    }
    
  }
  return(export_file)
  if(is.null(export_file)){print('There is no data for the entry stations names and time ranges')}
}

Post-processing of ISD data

For this exercice we will use the above stations and for a particular time-range.

Monthly trend of min temperature per station