Load libraries

source(here::here("libs/db.R")) # defines variables: con, dir_gdrive
librarian::shelf(
  DT, glue, janitor, lubridate, mapview, purrr, readr, readxl, 
  sf, skimr, stringr, tidyr)
options(readr.show_col_types = F)
mapviewOptions(fgb = FALSE)

# helper functions ----

# convert station ID to lon, lat using the proj library
proj <- "/Users/bbest/homebrew/bin/proj" # installed on Ben's MacBookPro from https://proj.org/apps/proj.html
lonlat_to_stationid <- function(lon, lat){
  system(glue("echo {lon} {lat} | {proj} +proj=calcofi +epsg=4326 -f '%05.1f'"), intern=T) %>%
    stringr::str_replace("\t", " ")
}
stationid_to_lonlat <- function(stationid){
  # using 5th decimal place, a la CCE_Stations.txt
  system(glue("echo {stationid} | {proj} +proj=calcofi +epsg=4326  -I -d 5"), intern=T) %>%
    stringr::str_replace("\t", " ")
}
# (a_staid  <- stations$Sta_ID[1]) # "001.0 168.0"
# (a_lonlat <- stationid_to_lonlat(a_staid)) # 93.3 26.7
# a_lon <- str_split(a_lonlat, " ", simplify=T)[1] %>% as.double()
# a_lat <- str_split(a_lonlat, " ", simplify=T)[2] %>% as.double()
# lonlat_to_stationid(a_lon, a_lat) # "1.00 168.00" -> "001.0 168.0"
# "1.0 168.0"

Time for the chunk packages to run: 6.697 seconds

Read input tables

Initially copied from CalCOFI/calcofi4r:data-raw/bottle.R.

# paths ----

# oceanographic source paths
bottle_csv <- file.path(dir_gdrive, "data/oceanographic-data/bottle-database/CalCOFI_Database_194903-202001_csv_22Sep2021/194903-202001_Bottle.csv")
cast_csv   <- file.path(dir_gdrive, "data/oceanographic-data/bottle-database/CalCOFI_Database_194903-202001_csv_22Sep2021/194903-202001_Cast.csv")
# bottle_cast_rds <- file.path(dir_gdrive, "data/oceanographic-data/bottle-database/bottle_cast.rds")
DIC_csv    <- file.path(dir_gdrive, "data/DIC/CalCOFI_DICs_200901-201507_28June2018.csv")

# stations source paths
# source: Shonna Dovel <sdovel@ucsd.edu> 2022-03-17
stations_ccelter_xls <- file.path(dir_gdrive, "data/CalCOFI-stations/CalCOFI station LatLong.xlsx")
# sourc: http://cce.lternet.edu/data/gmt-mapper
stations_cce_txt     <- file.path(dir_gdrive, "data/CalCOFI-stations/CCE_Stations.txt")
stations_sccoos_txt  <- file.path(dir_gdrive, "data/CalCOFI-stations/SCCOOS_Stations.txt")

# study area destination paths
calcofi_geo           <- here("data/calcofi_oceano-bottle-stations_convex-hull.geojson")
calcofi_offshore_geo  <- here("data/calcofi_oceano-bottle-stations_convex-hull_offshore.geojson")
calcofi_nearshore_geo <- here("data/calcofi_oceano-bottle-stations_convex-hull_nearshore.geojson")

# check paths
stopifnot(all(file.exists(
  bottle_csv, cast_csv, DIC_csv,
  stations_ccelter_xls, stations_cce_txt, stations_sccoos_txt)))

# read csv sources ----

d_bottle <- read_csv(bottle_csv, skip=1, col_names = F, guess_max = 1000000)
#d_bottle_problems() <- problems()
names(d_bottle) <- str_split(
  readLines(bottle_csv, n=1), ",")[[1]] %>%
  str_replace("\xb5", "µ")

d_cast  <- read_csv(cast_csv)

d_DIC <- read_csv(DIC_csv, skip=1, col_names = F, guess_max = 1000000)
names(d_DIC) <- str_split(
  readLines(DIC_csv, n=1), ",")[[1]] %>%
  str_replace("\xb5", "µ")
# d_DIC %>% head() %>% View()
d_DIC <- d_DIC %>%
  rename("Sta_ID"="Line Sta_ID")

Time for the chunk paths to run: 62.65 seconds

stations

# get unique station IDs from the casts
stations <- d_cast %>%
  select(Sta_ID) %>%
  group_by(Sta_ID) %>%
  summarize() %>%
  mutate(
    is_cast     = TRUE,
    lonlat_proj = map_chr(Sta_ID, stationid_to_lonlat)) %>%
  separate(lonlat_proj, c("lon", "lat"), sep=" ", convert = T) %>%
  separate(
    Sta_ID, c("Sta_ID_line", "Sta_ID_station"), sep=" ", remove=F, convert=T) %>%
  mutate(
    is_offshore = ifelse(Sta_ID_station > 60, T, F)) %>%
  st_as_sf(
    coords = c("lon", "lat"), crs=4326, remove = F)
# stations$Sta_ID
# mapview(stations)

# + CCE field ----
stations_cce <- read_tsv(stations_cce_txt, skip = 2) %>%
  mutate(
    is_cce = TRUE,
    Sta_ID = map2_chr(LonDec, LatDec, lonlat_to_stationid))
stopifnot(sum(!stations_cce$Sta_ID %in% stations$Sta_ID) == 0)
stations <- stations %>%
  left_join(
    stations_cce %>%
      select(Sta_ID, is_cce),
    by = "Sta_ID")

# + CCE-LTER field ----
stations_ccelter <- read_excel(stations_ccelter_xls) %>%
  mutate(
    is_ccelter = TRUE,
    Sta_ID       = glue("{sprintf('%05.1f', Line)} {sprintf('%05.1f', Sta)}"),
    lonlat_proj  = map_chr(Sta_ID, stationid_to_lonlat)) %>%
  separate(
    lonlat_proj, c("lon", "lat"), sep=" ", convert = T) %>%
  select(Sta_ID, is_ccelter, lon, lon_0 = `Lon Dec`, lat, lat_0 = `Lat Dec`)
# stations_ccelter %>% View()

stopifnot(sum(!stations_ccelter$Sta_ID %in% stations$Sta_ID) == 0)
stations <- stations %>%
  left_join(
    stations_ccelter %>%
      select(Sta_ID, is_ccelter),
    by = "Sta_ID")

# + SCCOOS field ----
stations_sccoos <- read_tsv(stations_sccoos_txt) %>%
  mutate(
    is_sccoos = TRUE,
    Sta_ID = map2_chr(LonDec, LatDec, lonlat_to_stationid))
stopifnot(sum(!stations_sccoos$Sta_ID %in% stations$Sta_ID) == 0)
stations <- stations %>%
  left_join(
    stations_sccoos %>%
      select(Sta_ID, is_sccoos),
    by = "Sta_ID")

# stations_0 <- stations
stations <- stations %>% 
  mutate(across(where(is.logical), ~replace_na(.,FALSE)))

stations %>%
  select(is_cce, is_ccelter) %>%
  st_drop_geometry() %>%
  table(useNA = "ifany")
##        is_ccelter
## is_cce  FALSE TRUE
##   FALSE  2521    0
##   TRUE     47   66
#           is_cce
# is_ccelter FALSE TRUE
#      FALSE  2521   47
#      TRUE      0   66
# So: 47 extra stations in CCE that are missing in  CCE-LTER
stations %>%
  select(is_cce, is_sccoos) %>%
  st_drop_geometry() %>%
  table(useNA = "ifany")
##        is_sccoos
## is_cce  FALSE TRUE
##   FALSE  2521    0
##   TRUE    104    9
#        is_sccoos
# is_cce  FALSE TRUE
#   FALSE  2521    0
#   TRUE    104    9
# So: only 9 stations in SCCOS, none of which are in CCE

# field order ----
stations <- stations %>%
  select(
    sta_id = Sta_ID, 
    sta_id_line = Sta_ID_line, sta_id_station = Sta_ID_station,
    lon, lat,
    is_offshore,
    is_cce, is_ccelter, is_sccoos) %>% 
  rename(geom = geometry)

# write to database
st_write(stations, con, "stations")
# to get to show as layer in tile.calcofi.io
dbSendQuery(con, "SELECT UpdateGeometrySRID('stations', 'geom', 4326);")
## <PqResult>
##   SQL  SELECT UpdateGeometrySRID('stations', 'geom', 4326);
##   ROWS Fetched: 0 [incomplete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS stations_geom_idx ON stations USING GIST (geom);")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS stations_geom_idx ON stations USING GIST (geom);
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS stations_staid_idx ON stations (sta_id)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS stations_staid_idx ON stations (sta_id)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
# show map of stations
mapview(stations)
# show table summary
stations %>% 
  st_drop_geometry() %>% 
  skim()
Data summary
Name Piped data
Number of rows 2634
Number of columns 9
_______________________
Column type frequency:
character 1
logical 4
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sta_id 0 1 11 11 0 2634 0

Variable type: logical

skim_variable n_missing complete_rate mean count
is_offshore 0 1 0.42 FAL: 1534, TRU: 1100
is_cce 0 1 0.04 FAL: 2521, TRU: 113
is_ccelter 0 1 0.03 FAL: 2568, TRU: 66
is_sccoos 0 1 0.00 FAL: 2625, TRU: 9

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
sta_id_line 0 1 92.01 31.45 1.00 73.30 93.00 113.30 176.70 ▁▃▇▅▁
sta_id_station 0 1 72.87 67.35 0.00 39.00 56.00 80.00 531.00 ▇▁▁▁▁
lon 0 1 -120.74 6.71 -153.16 -123.96 -119.87 -116.33 -107.13 ▁▁▂▇▃
lat 0 1 31.64 5.28 17.41 28.52 31.60 34.49 47.94 ▁▅▇▂▁

Time for the chunk stations to run: 57.84 seconds

ctdcast

Note that cast is a reserved word in SQL for a function to convert data types, so we’re calling this table ctdcast instead of cast.

Writing queries to the database is also much easier if we don’t have to wrap field names in quotes. This means that all field names have to be lower case and not use any spaces, dashes or special characters. Thankfully janitor::make_clean_names() can handle this for us. We’ll just also keep track of the original field names in a separate table fields_renamed.

stopifnot(length(setdiff(unique(d_cast$Sta_ID), stations$sta_id)) == 0)

d_cast <- d_cast %>% 
  mutate(
    datetime = map2_chr(Date, Time, function(f_date, f_time){
      s_time <- ifelse(
        is.na(f_time), 
        "00:00:00",
        str_split(f_time, " ")[[1]][2])
      glue("{f_date} {s_time}") }) %>% 
      as_datetime(
        format = "%m/%d/%Y %H:%M:%S",
        tz = "America/Los_Angeles"), # TODO: confirm PST and not UTC timezone
    Date = as_date(Date, format = "%m/%d/%Y")) %>% 
  st_as_sf(
    coords = c("Lon_Dec", "Lat_Dec"), crs=4326, remove = F) %>%
  rename(geom = geometry)

# rename fields
flds_orig       <- names(d_cast)
flds            <- make_clean_names(flds_orig)
names(d_cast) <- flds
d_flds <- tibble(
  table      = "ctdcast",
  field_orig = flds_orig,
  field      = flds)

# write spatial points to database
st_write(d_cast, con, "ctdcast")
# to get to show as layer in tile.calcofi.io
dbSendQuery(con, "SELECT UpdateGeometrySRID('ctdcast', 'geom', 4326);")
## <PqResult>
##   SQL  SELECT UpdateGeometrySRID('ctdcast', 'geom', 4326);
##   ROWS Fetched: 0 [incomplete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcast_geom_idx ON ctdcast USING GIST (geom);")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcast_geom_idx ON ctdcast USING GIST (geom);
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcast_staid_idx ON ctdcast (sta_id)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcast_staid_idx ON ctdcast (sta_id)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcast_cstcnt_idx ON ctdcast (cst_cnt)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcast_cstcnt_idx ON ctdcast (cst_cnt)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcast_date_idx ON ctdcast (date)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcast_date_idx ON ctdcast (date)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcast_datetime_idx ON ctdcast (datetime)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcast_datetime_idx ON ctdcast (datetime)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcast_datetime_idx ON ctdcast (datetime)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcast_datetime_idx ON ctdcast (datetime)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
# show table summary
d_cast %>% 
  st_drop_geometry() %>% 
  skim()
Data summary
Name Piped data
Number of rows 35376
Number of columns 62
_______________________
Column type frequency:
character 20
Date 1
numeric 40
POSIXct 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
cruise_id 0 1.00 17 17 0 658 0
db_sta_id 0 1.00 8 8 0 2634 0
cast_id 0 1.00 30 31 0 35376 0
sta_id 0 1.00 11 11 0 2634 0
sta_code 0 1.00 2 3 0 7 0
time 324 0.99 19 19 0 3659 0
lat_hem 0 1.00 1 1 0 1 0
lon_hem 0 1.00 1 1 0 1 0
rpt_line 0 1.00 3 5 0 698 0
rpt_sta 0 1.00 2 5 0 880 0
ship_name 0 1.00 4 24 0 36 0
ship_code 0 1.00 4 4 0 34 0
data_type 0 1.00 2 2 0 5 0
orig_sta_id 0 1.00 1 10 0 4190 0
data_or 16129 0.54 2 4 0 4 0
cruz_num 0 1.00 4 8 0 391 0
inc_str 33083 0.06 19 19 0 120 0
inc_end 33083 0.06 19 19 0 203 0
pst_lan 33083 0.06 19 19 0 80 0
civil_t 33084 0.06 19 19 0 181 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 0 1 1949-02-28 2020-01-26 1972-02-08 6677

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
cst_cnt 0 1.00 1.768850e+04 1.021232e+04 1.00000e+00 8.844750e+03 1.768850e+04 2.653225e+04 3.537600e+04 ▇▇▇▇▇
cruise 0 1.00 1.975272e+05 2.076550e+03 1.94903e+05 1.957040e+05 1.972020e+05 1.990080e+05 2.020010e+05 ▇▅▃▂▂
cruz_sta 0 1.00 1.975273e+13 2.076544e+11 1.94903e+13 1.957041e+13 1.972021e+13 1.990081e+13 2.020011e+13 ▇▅▃▂▂
quarter 0 1.00 2.370000e+00 1.090000e+00 1.00000e+00 1.000000e+00 2.000000e+00 3.000000e+00 4.000000e+00 ▇▇▁▇▆
distance 14690 0.58 -1.155500e+02 1.165800e+02 -1.92015e+03 -1.698000e+02 -8.337000e+01 -2.928000e+01 -1.400000e-01 ▁▁▁▁▇
year 0 1.00 1.975210e+03 2.077000e+01 1.94900e+03 1.957000e+03 1.972000e+03 1.990000e+03 2.020000e+03 ▇▅▃▂▂
month 0 1.00 5.820000e+00 3.330000e+00 1.00000e+00 3.000000e+00 6.000000e+00 9.000000e+00 1.200000e+01 ▇▆▅▃▆
julian_date 0 1.00 2.730301e+04 7.013290e+03 1.79570e+04 2.093700e+04 2.633750e+04 3.308400e+04 4.111600e+04 ▇▃▃▃▃
julian_day 0 1.00 1.610200e+02 1.009800e+02 2.00000e+00 7.400000e+01 1.560000e+02 2.460000e+02 3.540000e+02 ▇▇▆▅▆
lat_dec 0 1.00 3.183000e+01 3.620000e+00 1.84200e+01 2.983000e+01 3.242000e+01 3.383000e+01 4.792000e+01 ▁▃▇▁▁
lat_deg 0 1.00 3.134000e+01 3.630000e+00 1.80000e+01 2.900000e+01 3.200000e+01 3.300000e+01 4.700000e+01 ▁▃▇▁▁
lat_min 0 1.00 2.978000e+01 1.724000e+01 0.00000e+00 1.476000e+01 2.910000e+01 4.500000e+01 5.990000e+01 ▇▇▇▆▇
lon_dec 0 1.00 -1.197900e+02 3.950000e+00 -1.64080e+02 -1.220800e+02 -1.196600e+02 -1.174900e+02 -1.059700e+02 ▁▁▁▇▃
lon_deg 0 1.00 -1.192800e+02 3.970000e+00 -1.64000e+02 -1.220000e+02 -1.190000e+02 -1.170000e+02 -1.050000e+02 ▁▁▁▇▂
lon_min 0 1.00 3.050000e+01 1.730000e+01 0.00000e+00 1.600000e+01 3.030000e+01 4.600000e+01 5.999000e+01 ▆▇▇▇▇
st_line 0 1.00 9.327000e+01 2.840000e+01 1.00000e+00 8.000000e+01 9.000000e+01 1.030000e+02 1.734000e+03 ▇▁▁▁▁
ac_line 0 1.00 9.306000e+01 2.201000e+01 0.00000e+00 8.040000e+01 9.000000e+01 1.035000e+02 1.765000e+02 ▁▂▇▃▁
st_station 0 1.00 6.210000e+01 3.284000e+01 5.00000e-01 4.000000e+01 5.500000e+01 8.000000e+01 5.310000e+02 ▇▁▁▁▁
ac_sta 0 1.00 6.167000e+01 3.318000e+01 -3.06000e+01 4.000000e+01 5.510000e+01 7.990000e+01 5.311000e+02 ▇▂▁▁▁
bottom_d 2281 0.94 2.243810e+03 1.612050e+03 7.00000e+00 6.470000e+02 2.061000e+03 3.840000e+03 8.230000e+03 ▇▃▇▁▁
secchi 29791 0.16 1.982000e+01 8.730000e+00 1.00000e+00 1.300000e+01 1.900000e+01 2.600000e+01 5.800000e+01 ▃▇▅▁▁
forel_u 33299 0.06 3.040000e+00 1.810000e+00 1.00000e+00 2.000000e+00 3.000000e+00 4.000000e+00 2.100000e+01 ▇▁▁▁▁
order_occ 24450 0.31 5.038000e+01 7.126000e+01 1.00000e+00 1.900000e+01 3.900000e+01 6.000000e+01 3.003000e+03 ▇▁▁▁▁
event_num 4 1.00 1.507270e+03 1.219840e+03 1.00000e+00 5.260000e+02 1.188000e+03 2.212000e+03 3.002200e+04 ▇▁▁▁▁
cruz_leg 32940 0.07 1.040000e+00 4.400000e-01 0.00000e+00 1.000000e+00 1.000000e+00 1.000000e+00 3.000000e+00 ▁▇▁▁▁
int_chl 22228 0.37 5.042000e+01 6.066000e+01 2.40000e+00 2.550000e+01 3.300000e+01 4.870000e+01 1.043200e+03 ▇▁▁▁▁
int_c14 33088 0.06 3.962600e+02 4.235200e+02 2.95000e+01 1.583500e+02 2.563500e+02 4.673800e+02 5.946400e+03 ▇▁▁▁▁
time_zone 33085 0.06 8.000000e+00 0.000000e+00 8.00000e+00 8.000000e+00 8.000000e+00 8.000000e+00 8.000000e+00 ▁▁▇▁▁
wave_dir 22311 0.37 2.950000e+01 8.140000e+00 0.00000e+00 2.800000e+01 3.100000e+01 3.300000e+01 4.900000e+01 ▁▁▃▇▁
wave_ht 22772 0.36 4.320000e+00 2.780000e+00 0.00000e+00 2.000000e+00 4.000000e+00 6.000000e+00 4.000000e+01 ▇▁▁▁▁
wave_prd 23240 0.34 6.470000e+00 2.290000e+00 0.00000e+00 5.000000e+00 6.000000e+00 8.000000e+00 5.000000e+01 ▇▁▁▁▁
wind_dir 1477 0.96 2.705000e+01 9.450000e+00 0.00000e+00 2.600000e+01 3.100000e+01 3.300000e+01 3.800000e+01 ▁▁▁▃▇
wind_spd 1679 0.95 1.114000e+01 6.830000e+00 0.00000e+00 6.000000e+00 1.000000e+01 1.500000e+01 9.000000e+01 ▇▁▁▁▁
barometer 15342 0.57 1.016250e+03 8.650000e+00 1.01400e+02 1.013700e+03 1.016300e+03 1.019100e+03 1.112600e+03 ▁▁▁▁▇
dry_t 15335 0.57 1.626000e+01 2.710000e+00 1.70000e+00 1.440000e+01 1.600000e+01 1.790000e+01 4.310000e+01 ▁▇▂▁▁
wet_t 15417 0.56 1.441000e+01 2.630000e+00 1.80000e+00 1.260000e+01 1.420000e+01 1.600000e+01 5.800000e+01 ▃▇▁▁▁
wea 8062 0.77 1.320000e+00 1.160000e+00 0.00000e+00 1.000000e+00 1.000000e+00 2.000000e+00 9.000000e+00 ▇▃▁▁▁
cloud_typ 25028 0.29 5.650000e+00 2.160000e+00 0.00000e+00 6.000000e+00 6.000000e+00 7.000000e+00 9.000000e+00 ▁▁▁▇▂
cloud_amt 22992 0.35 4.980000e+00 3.110000e+00 0.00000e+00 2.000000e+00 6.000000e+00 8.000000e+00 1.700000e+01 ▆▃▇▁▁
visibility 25363 0.28 6.800000e+00 1.520000e+00 0.00000e+00 6.000000e+00 7.000000e+00 8.000000e+00 9.000000e+00 ▁▁▁▇▅

Variable type: POSIXct

skim_variable n_missing complete_rate min max median n_unique
datetime 4 1 1949-02-28 22:42:00 2020-01-26 01:51:16 1972-02-08 22:25:00 34509

Time for the chunk ctdcast to run: 15.76 seconds

ctdcast_bottle

# rename fields
flds_orig       <- names(d_bottle)
flds            <- make_clean_names(flds_orig)
names(d_bottle) <- flds
d_flds <- bind_rows(
  d_flds,
  tibble(
    table      = "ctdcast_bottle",
    field_orig = flds_orig,
    field      = flds))

# write table to database
dbWriteTable(con, "ctdcast_bottle", d_bottle, overwrite=T)
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcastbottle_staid_idx  ON ctdcast_bottle (sta_id)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcastbottle_staid_idx  ON ctdcast_bottle (sta_id)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcastbottle_cstcnt_idx ON ctdcast_bottle (cst_cnt)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcastbottle_cstcnt_idx ON ctdcast_bottle (cst_cnt)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcastbottle_btlcnt_idx ON ctdcast_bottle (btl_cnt)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcastbottle_btlcnt_idx ON ctdcast_bottle (btl_cnt)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
# show table summary
skim(d_bottle)
Data summary
Name d_bottle
Number of rows 889500
Number of columns 62
_______________________
Column type frequency:
character 4
numeric 58
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sta_id 0 1.00 11 11 0 2634 0
depth_id 0 1.00 38 38 0 889478 0
inc_tim 873953 0.02 19 19 0 203 0
dic_quality_comment 889445 0.00 28 117 0 37 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
cst_cnt 0 1.00 17630.51 10510.10 1.00 8483.00 17502.00 27262.00 35376.00 ▇▇▆▇▇
btl_cnt 0 1.00 444750.50 256776.68 1.00 222375.75 444750.50 667125.25 889500.00 ▇▇▇▇▇
depthm 0 1.00 224.85 312.89 0.00 45.00 125.00 300.00 5351.00 ▇▁▁▁▁
t_deg_c 10965 0.99 10.81 4.23 1.44 7.70 10.07 13.90 31.14 ▃▇▅▁▁
salnty 47355 0.95 33.84 0.46 28.37 33.49 33.86 34.19 37.03 ▁▁▃▇▁
o2ml_l 169507 0.81 3.40 2.07 -0.01 1.37 3.45 5.50 11.13 ▇▅▇▁▁
s_theta 52692 0.94 25.82 1.16 20.93 24.96 25.99 26.64 250.78 ▇▁▁▁▁
o2sat 204428 0.77 57.21 37.07 -0.10 21.30 54.50 97.63 214.10 ▇▅▆▁▁
oxy_mmol_kg 204439 0.77 149.09 90.10 -0.43 61.76 151.49 240.56 485.70 ▇▅▇▁▁
btl_num 753962 0.15 10.49 6.21 0.00 5.00 10.00 16.00 25.00 ▇▇▆▆▂
rec_ind 0 1.00 4.69 1.88 3.00 3.00 3.00 7.00 7.00 ▇▁▂▁▆
t_prec 10963 0.99 2.02 0.15 1.00 2.00 2.00 2.00 9.00 ▇▁▁▁▁
t_qual 866371 0.03 7.47 1.48 6.00 6.00 6.00 9.00 9.00 ▇▁▁▁▇
s_prec 47355 0.95 2.73 0.45 2.00 2.00 3.00 3.00 3.00 ▃▁▁▁▇
s_qual 806691 0.09 7.76 1.46 6.00 6.00 9.00 9.00 9.00 ▆▁▁▁▇
p_qual 215744 0.76 9.00 0.00 6.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
o_qual 693139 0.22 8.60 1.01 6.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
s_thtaq 815766 0.08 8.20 1.30 6.00 6.00 9.00 9.00 9.00 ▃▁▁▁▇
o2satq 660023 0.26 8.66 1.05 2.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
chlor_a 647809 0.27 0.45 1.21 0.00 0.05 0.16 0.39 66.11 ▇▁▁▁▁
chlqua 241760 0.73 9.00 0.01 8.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
phaeop 647810 0.27 0.20 0.37 -3.89 0.05 0.11 0.23 65.30 ▇▁▁▁▁
phaqua 241756 0.73 9.00 0.01 8.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
po4u_m 454050 0.49 1.56 1.04 0.00 0.46 1.57 2.47 5.21 ▇▅▆▁▁
po4q 435207 0.51 9.00 0.03 4.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
si_o3u_m 513274 0.42 26.50 27.47 0.00 3.10 18.00 41.59 196.00 ▇▂▁▁▁
si_o3qu 376055 0.58 9.00 0.02 4.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
no2u_m 530094 0.40 0.04 0.10 0.00 0.00 0.01 0.03 8.19 ▇▁▁▁▁
no2q 344103 0.61 8.97 0.17 4.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
no3u_m 529961 0.40 17.30 14.58 -0.40 0.60 18.10 30.00 95.00 ▇▆▂▁▁
no3q 352930 0.60 8.99 0.16 4.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
nh3u_m 803150 0.10 0.08 0.27 0.00 0.00 0.00 0.06 15.63 ▇▁▁▁▁
nh3q 61445 0.93 8.85 0.86 4.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
c14as1 873958 0.02 9.87 24.48 -0.24 0.95 2.69 8.10 584.50 ▇▁▁▁▁
c14a1p 876740 0.01 1.27 0.44 1.00 1.00 1.00 2.00 2.00 ▇▁▁▁▃
c14a1q 17377 0.98 9.00 0.00 8.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
c14as2 873976 0.02 9.91 25.88 -0.20 0.94 2.60 8.19 948.30 ▇▁▁▁▁
c14a2p 876758 0.01 1.27 0.44 1.00 1.00 1.00 2.00 2.00 ▇▁▁▁▃
c14a2q 17359 0.98 9.00 0.00 8.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
dark_as 865741 0.03 0.16 0.25 -0.01 0.06 0.10 0.18 9.82 ▇▁▁▁▁
dark_ap 869043 0.02 1.99 0.08 1.00 2.00 2.00 2.00 2.00 ▁▁▁▁▇
dark_aq 25542 0.97 9.00 0.00 8.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
mean_as 865740 0.03 8.54 22.14 -0.22 0.99 2.50 7.07 948.30 ▇▁▁▁▁
mean_ap 869043 0.02 1.26 0.44 1.00 1.00 1.00 2.00 2.00 ▇▁▁▁▃
mean_aq 25543 0.97 9.00 0.00 8.00 9.00 9.00 9.00 9.00 ▁▁▁▁▇
light_p 869733 0.02 17.34 29.29 0.00 0.25 1.40 21.00 99.90 ▇▁▁▁▁
r_depth 1 1.00 226.41 316.25 0.00 45.00 126.00 302.00 5458.00 ▇▁▁▁▁
r_temp 46050 0.95 10.86 4.22 0.00 7.75 10.11 13.94 31.14 ▂▇▅▁▁
r_sal 52781 0.94 221.32 93.29 0.40 144.10 203.80 300.20 683.40 ▃▇▅▁▁
r_dynht 46666 0.95 0.43 0.38 0.00 0.13 0.34 0.63 3.88 ▇▂▁▁▁
r_nuts 803130 0.10 0.08 0.27 0.00 0.00 0.00 0.06 15.63 ▇▁▁▁▁
r_oxy_mmol_kg 204439 0.77 149.09 90.10 -0.43 61.76 151.49 240.56 485.70 ▇▅▇▁▁
dic1 887501 0.00 2153.24 113.00 1948.85 2028.33 2170.64 2253.81 2367.80 ▇▃▅▇▃
dic2 889276 0.00 2168.15 154.85 1969.44 2008.98 2265.89 2315.52 2364.42 ▇▁▁▁▇
ta1 887416 0.00 2256.06 34.84 2181.57 2230.32 2244.32 2278.50 2434.90 ▅▇▃▁▁
ta2 889266 0.00 2278.86 58.50 2198.15 2229.06 2247.50 2316.45 2437.00 ▇▁▇▁▁
p_h1 889416 0.00 7.91 0.08 7.62 7.90 7.93 7.96 8.05 ▁▁▁▇▂
p_h2 889490 0.00 7.95 0.02 7.92 7.93 7.95 7.96 7.99 ▇▂▃▃▂

Time for the chunk ctdcast_bottle to run: 140 seconds

ctdcast_bottle_dic

# ensure bottle.btl_cnt is unique in 
stopifnot(d_bottle$btl_cnt %>% duplicated() %>% sum() == 0)

# rename fields
flds_orig       <- names(d_DIC)
flds            <- make_clean_names(flds_orig)
flds[which(flds_orig == "Cast_Index")]   <- "cst_cnt"
flds[which(flds_orig == "Bottle_Index")] <- "btl_cnt"
names(d_DIC) <- flds
d_flds <- bind_rows(
  d_flds,
  tibble(
    table       = "ctdcast_bottle_dic",
    field_orig = flds_orig,
    field      = flds))

# ensure uniquely matchable to cast_bottle
x <- anti_join(
  d_DIC,
  d_bottle, 
  by="btl_cnt")
stopifnot(nrow(x) == 0)

# write table to database
dbWriteTable(con, "ctdcast_bottle_dic", d_DIC, overwrite=T)
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcastbottledic_staid_idx  ON ctdcast_bottle_dic (sta_id)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcastbottledic_staid_idx  ON ctdcast_bottle_dic (sta_id)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcastbottledic_cstcnt_idx ON ctdcast_bottle_dic (cst_cnt)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcastbottledic_cstcnt_idx ON ctdcast_bottle_dic (cst_cnt)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
dbSendQuery(con, "CREATE INDEX IF NOT EXISTS ctdcastbottledic_btlcnt_idx ON ctdcast_bottle_dic (btl_cnt)")
## <PqResult>
##   SQL  CREATE INDEX IF NOT EXISTS ctdcastbottledic_btlcnt_idx ON ctdcast_bottle_dic (btl_cnt)
##   ROWS Fetched: 0 [complete]
##        Changed: 0
# show table summary
skim(d_DIC)
Data summary
Name d_DIC
Number of rows 2084
Number of columns 23
_______________________
Column type frequency:
character 3
numeric 20
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
sta_id 0 1.00 11 11 0 33 0
depth_id 0 1.00 38 38 0 2084 0
dic_quality_comment 2029 0.03 28 115 0 36 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 1058.07 613.45 1.00 521.75 1061.50 1585.25 2130.00 ▇▇▇▇▇
cst_cnt 0 1.00 33165.93 596.82 31785.00 32780.00 33312.00 33646.00 33973.00 ▂▂▅▆▇
btl_cnt 0 1.00 833038.33 15412.45 796974.00 823082.75 836834.00 845385.50 853785.00 ▂▂▅▅▇
cruise 0 1.00 201258.07 196.70 200808.00 201108.00 201304.00 201407.00 201507.00 ▂▁▆▃▇
depth_m 0 1.00 186.48 326.67 1.00 30.00 100.00 231.00 3542.00 ▇▁▁▁▁
dic1 85 0.96 2153.24 113.00 1948.85 2028.33 2170.64 2253.81 2367.80 ▇▃▅▇▃
dic2 1860 0.11 2168.15 154.85 1969.44 2008.98 2265.89 2315.52 2364.42 ▇▁▁▁▇
ta1 0 1.00 2256.06 34.84 2181.57 2230.32 2244.32 2278.50 2434.90 ▅▇▃▁▁
ta2 1850 0.11 2278.86 58.50 2198.15 2229.06 2247.50 2316.45 2437.00 ▇▁▇▁▁
p_h1 2000 0.04 7.91 0.08 7.62 7.90 7.93 7.96 8.05 ▁▁▁▇▂
p_h2 2074 0.00 7.95 0.02 7.92 7.93 7.95 7.96 7.99 ▇▂▃▃▂
salinity1 0 1.00 33.76 0.40 32.84 33.42 33.73 34.15 34.68 ▂▇▃▇▁
salinity2 1849 0.11 33.83 0.52 32.94 33.35 33.66 34.29 34.82 ▅▆▁▇▁
temperature_deg_c 0 1.00 11.05 3.80 1.52 8.25 10.00 14.01 22.75 ▁▇▅▃▁
bottle_salinity 0 1.00 33.77 0.40 32.84 33.42 33.74 34.15 34.68 ▂▇▃▇▁
bottle_o2_ml_l 0 1.00 3.40 2.14 0.00 1.36 3.22 5.64 7.81 ▆▅▂▇▁
bottle_o2_mmol_kg 0 1.00 148.28 93.39 0.00 59.31 140.12 245.95 340.42 ▆▅▂▇▁
sigma_theta 0 1.00 25.74 0.98 22.98 24.91 25.96 26.58 27.78 ▁▅▅▇▂
dic_bottle_id1 0 1.00 4269.62 3603.41 1.00 390.75 2952.50 7503.25 10004.00 ▇▂▂▃▅
dic_bottle_id2 1852 0.11 3519.40 3441.65 2.00 331.25 2247.00 6907.50 9959.00 ▇▂▂▂▃

Time for the chunk ctdcast_bottle_dic to run: 2.989 seconds

fields_renamed

# write table to database
dbWriteTable(con, "fields_renamed", d_flds, overwrite=T)

# show table
datatable(d_flds)

Time for the chunk fields_renamed to run: 1.857 seconds