"""Read module."""
from __future__ import annotations
import bz2
import collections
import datetime
import gzip
import linecache
import re
import warnings
import zipfile
from io import StringIO
from pathlib import Path
import chardet
import gsw
import numpy as np
import pandas as pd
def _basename(fname: str | Path) -> (str, str, str):
"""Return file name without path."""
if not isinstance(fname, Path):
fname = Path(fname)
path, name, ext = fname.parent, fname.stem, fname.suffix
return path, name, ext
def _normalize_names(name: str) -> str:
"""Normalize column names."""
name = name.strip()
return name.strip("*")
def _open_compressed(fname: Path) -> str:
"""Open compressed gzip, gz, zip or bz2 files."""
extension = fname.suffix.casefold()
loaders = {
".gzip": gzip.open,
".gz": gzip.open,
".bz2": bz2.BZ2File,
".zip": zipfile.ZipFile,
}
loader = loaders.get(extension)
if loader is None:
valid = ", ".join(loaders.keys())
msg = f"Unrecognized file extension. Expected {valid}, got {extension}."
raise ValueError(msg)
if extension == ".zip":
# NOTE: Zip format may contain more than one file in the archive
# (similar to tar), here we assume that there is just one file per
# zipfile! Also, we ask for the name because it can be different from
# the zipfile file!!
with loader(str(fname)) as zfile:
name = zfile.namelist()[0]
with zfile.open(name) as cfile:
return cfile.read()
with loader(str(fname)) as cfile:
return cfile.read()
def _read_file(fname: str | Path | StringIO) -> StringIO:
"""Read file contents, or read from StringIO object."""
if isinstance(fname, StringIO):
fname.seek(0)
text = fname.read()
return StringIO(text)
if not isinstance(fname, Path):
fname = Path(fname).resolve()
extension = fname.suffix.casefold()
if extension in [".gzip", ".gz", ".bz2", ".zip"]:
contents = _open_compressed(fname)
elif extension in [".cnv", ".edf", ".txt", ".ros", ".btl", ".bl", ".csv"]:
contents = fname.read_bytes()
else:
msg = (
"Unrecognized file extension. "
f"Expected .cnv, .edf, .txt, .ros, or .btl got {extension}"
)
raise ValueError(
msg,
)
# Read as bytes but we need to return strings for the parsers.
encoding = chardet.detect(contents)["encoding"]
if encoding is None:
encoding = "utf-8"
text = contents.decode(encoding=encoding, errors="replace")
return StringIO(text)
def _remane_duplicate_columns(names: str) -> str:
"""Rename a column when it is duplicated."""
items = collections.Counter(names).items()
dup = []
for item, count in items:
if count > 2: # noqa: PLR2004
msg = (
"Cannot handle more than two duplicated columns. "
f"Found {count} for {item}."
)
raise ValueError(
msg,
)
if count > 1:
dup.append(item)
# We can assume there are only two instances of a word in the list,
# we find the last index of an instance,
# which will be the second occurrence of the item.
second_occurrences = [
len(names) - names[::-1].index(item) - 1 for item in dup
]
for idx in second_occurrences:
names[idx] = f"{names[idx]}_"
return names
def _seabird_coordinate(
    line: str,
    positive: str,
    negative: str,
    label: str,
) -> float:
    """Convert an NMEA ``= degrees minutes hemisphere`` line to decimal degrees.

    ``positive`` and ``negative`` are the hemisphere letters giving each sign;
    ``label`` is used in the error raised for any other trailing letter.
    """
    hemisphere = line[-1]
    fields = line.strip(hemisphere).split("=")[1].strip()
    values = np.float64(fields.split())
    magnitude = values[0] + values[1] / 60.0
    if hemisphere == positive:
        return magnitude
    if hemisphere == negative:
        return -magnitude
    msg = f"{label} not recognized."
    raise ValueError(msg)


def _parse_seabird(lines: list, ftype: str) -> dict:  # noqa: C901, PLR0912, PLR0915
    """Parse the header of a Sea-Bird cnv or btl file.

    Parameters
    ----------
    lines : list of str
        All lines of the file.
    ftype : str
        ``"cnv"`` for converted files; any other value is parsed as a bottle
        (btl) file.

    Returns
    -------
    dict
        With the keys ``name`` (stem of the ``FileName`` header entry, or
        ``"unknown"``), ``header`` (lines starting with ``*``), ``config``
        (lines starting with ``#``), ``names`` (column names), ``skiprows``
        (number of lines before the data), and ``time``, ``lon``, ``lat``
        (taken from the NMEA entries, ``None`` when absent).

    Raises
    ------
    ValueError
        If an NMEA position does not end with a known hemisphere letter, or
        if a column name is repeated more than twice.
    """
    # Each value defaults to a plain None so callers can test `is None`.
    lon = lat = time = None
    fname = None
    skiprows = 0
    header, config, names = [], [], []

    for k, raw_line in enumerate(lines):
        line = raw_line.strip()

        # Only cnv has columns names,
        # for bottle files we will use the variable row.
        if ftype == "cnv" and "# name" in line:
            # Split once on each separator so that "=" or ":" inside the
            # description cannot break the unpacking.
            name, _unit = line.split("=", 1)[1].split(":", 1)
            names.append(_normalize_names(name))

        # Seabird headers starts with *.
        if line.startswith("*"):
            header.append(line)
            if "FileName" in line:
                file_path = line.split("=")[-1].strip()
                # The recorded path may use Windows separators, which Path
                # does not split on POSIX; keep the last component by hand.
                fname = Path(re.split(r"[\\/]", file_path)[-1]).stem

        # Seabird configuration starts with #.
        if line.startswith("#"):
            config.append(line)

        # NMEA position and time.
        if "NMEA Latitude" in line:
            lat = _seabird_coordinate(line, "N", "S", "Latitude")
        if "NMEA Longitude" in line:
            lon = _seabird_coordinate(line, "E", "W", "Longitude")
        if "NMEA UTC (Time)" in line:
            stamp = line.split("=")[-1].strip()
            # Should use some fuzzy datetime parser to make this more robust.
            # The stamp is already UTC: attach the zone instead of converting
            # from the machine's local time, which would shift the value.
            time = datetime.datetime.strptime(
                stamp,
                "%b %d %Y %H:%M:%S",
            ).replace(tzinfo=datetime.UTC)

        if ftype == "cnv":
            # The cnv header ends with *END*; data starts on the next line.
            if line == "*END*":
                skiprows = k + 1
                break
        else:  # btl.
            # There is no *END* like in a .cnv file, skip two after header info.
            # Skip empty lines.
            if not line:
                continue
            if not line.startswith(("*", "#")):
                # Fix commonly occurring problem when Sbeox.* exists in the file
                # the name is concatenated to previous parameter
                # example:
                # CStarAt0Sbeox0Mm/Kg to CStarAt0 Sbeox0Mm/Kg
                line = re.sub(r"(\S)Sbeox", "\\1 Sbeox", line)
                names = line.split()
                skiprows = k + 2
                break

    if ftype == "btl":
        # Capture stat names column.
        names.append("Statistic")

    return {
        "name": fname or "unknown",
        "header": "\n".join(header),
        "config": "\n".join(config),
        "names": _remane_duplicate_columns(names),
        "skiprows": skiprows,
        "time": time,
        "lon": lon,
        "lat": lat,
    }
[docs]
def from_bl(fname: str | Path) -> pd.DataFrame:
    """Read Seabird bottle-trip (bl) file.

    The first two lines are treated as a header; the reset time is taken
    from the second line, after its first six characters. The remaining
    lines are parsed as comma-separated bottle records.

    Example:
    -------
    >>> from pathlib import Path
    >>> import ctd
    >>> data_path = Path(__file__).parents[1].joinpath("tests", "data")
    >>> df = ctd.from_bl(str(data_path.joinpath("bl", "bottletest.bl")))
    >>> df._metadata["time_of_reset"]
    datetime.datetime(2018, 6, 25, 20, 8, 55)

    """
    with _read_file(fname) as f:
        # Take the reset line from the decoded buffer rather than re-opening
        # `fname`, so compressed files and in-memory buffers work too.
        f.readline()
        reset_line = f.readline()
        f.seek(0)
        cast = pd.read_csv(
            f,
            skiprows=2,
            parse_dates=[1],
            index_col=0,
            names=["bottle_number", "time", "startscan", "endscan"],
        )

    # Drop only the line ending, so a final line without a newline keeps its
    # last character.
    reset_stamp = reset_line.rstrip("\r\n")[6:]
    cast._metadata = {  # noqa: SLF001
        "time_of_reset": pd.to_datetime(reset_stamp).to_pydatetime(),
    }
    return cast
[docs]
def from_btl(fname: str | Path) -> pd.DataFrame:
    """DataFrame constructor to open Seabird CTD BTL-ASCII format.

    The header is parsed with `_parse_seabird` and the result is stored on
    the returned frame as ``_metadata``. Columns that cannot be converted to
    float are left as they are and a warning is emitted.

    Examples
    --------
    >>> from pathlib import Path
    >>> import ctd
    >>> data_path = Path(__file__).parents[1].joinpath("tests", "data")
    >>> bottles = ctd.from_btl(data_path.joinpath("btl", "bottletest.btl"))

    """
    # The context manager closes the buffer even if parsing fails.
    with _read_file(fname) as f:
        metadata = _parse_seabird(f.readlines(), ftype="btl")
        f.seek(0)
        cast = pd.read_fwf(
            f,
            header=None,
            index_col=False,
            names=metadata["names"],
            parse_dates=False,
            skiprows=metadata["skiprows"],
        )

    # At this point the data frame is not correctly lined up (multiple rows
    # for avg, std, min, max or just avg, std, etc).
    # Also needs date,time,and bottle number to be converted to one per line.

    # Get row types, see what you have: avg, std, min, max or just avg, std.
    rowtypes = cast[cast.columns[-1]].unique()
    # Get times and dates which occur on second line of each bottle.
    date_idx = metadata["names"].index("Date")
    dates = cast.iloc[:: len(rowtypes), date_idx].reset_index(drop=True)
    times = cast.iloc[1 :: len(rowtypes), date_idx].reset_index(drop=True)
    datetimes = dates + " " + times

    # Fill the Date column with datetimes.
    cast.loc[:: len(rowtypes), "Date"] = datetimes.to_numpy()
    cast.loc[1 :: len(rowtypes), "Date"] = datetimes.to_numpy()

    # Fill missing rows.
    cast["Bottle"] = cast["Bottle"].ffill()
    cast["Date"] = cast["Date"].ffill()

    cast["Statistic"] = (
        cast["Statistic"].str.lstrip("(").str.rstrip(")")
    )  # (avg) to avg

    # The parser reports "unknown" when the header carries no FileName entry;
    # fall back to the name of the file that was read. An in-memory buffer
    # has no file name, so it keeps the parser's value.
    if metadata.get("name") in (None, "unknown") and not isinstance(
        fname,
        StringIO,
    ):
        metadata["name"] = str(_basename(fname)[1])

    dtypes = {
        "bpos": int,
        "pumps": bool,
        "flag": bool,
        "Bottle": int,
        "Scan": int,
        "Statistic": str,
        "Date": str,
    }
    for column in cast.columns:
        if column in dtypes:
            cast[column] = cast[column].astype(dtypes[column])
        else:
            try:
                cast[column] = cast[column].astype(float)
            except ValueError:
                warnings.warn(
                    f"Could not convert {column} to float.",
                    stacklevel=2,
                )

    cast["Date"] = pd.to_datetime(cast["Date"])
    cast._metadata = metadata  # noqa: SLF001
    return cast
[docs]
def _edf_coordinate(line: str, positive: str, negative: str) -> float | None:
    """Convert an EDF ``: degrees minutes hemisphere`` line to decimal degrees.

    Returns ``None`` when the line cannot be parsed or does not end with one
    of the two hemisphere letters.
    """
    try:
        hemisphere = line[-1]
        fields = line.strip(hemisphere).split(":")[1].strip()
        values = np.float64(fields.split())
        magnitude = values[0] + values[1] / 60.0
    except (IndexError, ValueError):
        return None
    if hemisphere == positive:
        return magnitude
    if hemisphere == negative:
        return -magnitude
    return None


def from_edf(fname: str | Path) -> pd.DataFrame:  # noqa: C901, PLR0912
    """DataFrame constructor to open XBT EDF ASCII format.

    The header is read up to the ``// Data`` line. Column names come from
    the ``Field`` lines and the frame is indexed by its ``depth`` column.
    The ``_metadata`` of the result holds ``lon``, ``lat``, ``name``,
    ``header`` and ``serial``; position and serial are ``None`` when the
    header does not provide them.

    Raises
    ------
    ValueError
        If the file has no ``// Data`` line.

    Examples
    --------
    >>> from pathlib import Path
    >>> import ctd
    >>> data_path = Path(__file__).parents[1].joinpath("tests", "data")
    >>> cast = ctd.from_edf(data_path.joinpath("XBT.EDF.gz"))
    >>> ax = cast["temperature"].plot_cast()

    """
    with _read_file(fname) as f:
        # Defaults, so a header without these entries does not leave the
        # names unbound.
        serial = lat = lon = None
        skiprows = None
        header, names = [], []
        for k, raw_line in enumerate(f.readlines()):
            line = raw_line.strip()
            if line.startswith("Serial Number"):
                serial = line.split(":")[1].strip()
            elif line.startswith("Latitude"):
                lat = _edf_coordinate(line, "N", "S")
            elif line.startswith("Longitude"):
                lon = _edf_coordinate(line, "E", "W")
            else:
                header.append(line)
                if line.startswith("Field"):
                    # Split on the first colon only and skip lines with no
                    # usable name after it.
                    _, _, unit = line.partition(":")
                    words = unit.strip().casefold().split()
                    if words:
                        names.append(words[0])
                if line == "// Data":
                    skiprows = k + 1
                    break

        if skiprows is None:
            msg = "Could not find the '// Data' line marking the end of the header."
            raise ValueError(msg)

        f.seek(0)
        cast = pd.read_csv(
            f,
            header=None,
            index_col=None,
            names=names,
            skiprows=skiprows,
            sep=r"\s+",
        )

    cast = cast.set_index("depth", drop=True)
    cast.index.name = "Depth [m]"
    name = _basename(fname)[1]

    metadata = {
        "lon": lon,
        "lat": lat,
        "name": str(name),
        "header": "\n".join(header),
        "serial": serial,
    }
    cast._metadata = metadata  # noqa: SLF001
    return cast
[docs]
def from_cnv(fname: str | Path) -> pd.DataFrame:
    """DataFrame constructor to open Seabird CTD CNV-ASCII format.

    The header is parsed with `_parse_seabird` and stored on the returned
    frame as ``_metadata``. The frame is indexed by the first pressure or
    depth column found. A ``depSM`` index is converted to pressure when a
    latitude is available; otherwise a warning is emitted and the index
    keeps the name ``depSM``.

    Raises
    ------
    ValueError
        If the file has no pressure or depth column.

    Examples
    --------
    >>> from pathlib import Path
    >>> import ctd
    >>> data_path = Path(__file__).parents[1].joinpath("tests", "data")
    >>> cast = ctd.from_cnv(data_path.joinpath("CTD_big.cnv.bz2"))
    >>> downcast, upcast = cast.split()
    >>> ax = downcast["t090C"].plot_cast()

    """
    with _read_file(fname) as f:
        lines = f.readlines()
    metadata = _parse_seabird(lines, ftype="cnv")

    # Blank lines would otherwise become rows made only of missing values.
    rows = (line.split() for line in lines[metadata["skiprows"] :])
    cast = pd.DataFrame(
        [fields for fields in rows if fields],
        columns=metadata["names"],
    )

    dtypes = {"bpos": int, "pumps": bool, "flag": bool}
    for column in cast.columns:
        # Values are still strings here, so convert to numbers first: casting
        # a non-empty string straight to bool is always True.
        numeric = pd.to_numeric(cast[column], errors="coerce")
        if column in dtypes:
            numeric = numeric.astype(dtypes[column])
        cast[column] = numeric

    prkeys = [
        "prM",
        "prE",
        "prDM",
        "pr50M",
        "pr50M1",
        "prSM",
        "prdM",
        "pr",
        "depSM",
        "prDE",
    ]
    cast.columns = cast.columns.str.strip()
    present = [key for key in prkeys if key in cast.columns]
    if not present:
        msg = "Expected one pressure/depth column, didn't receive any"
        raise ValueError(
            msg,
        )
    # If multiple keys are present keep the first one. Always take a plain
    # string so the comparison with "depSM" below works.
    prkey = present[0]
    cast = cast.set_index(prkey, drop=True)
    cast.index.name = "Pressure [dbar]"
    if prkey == "depSM":
        lat = metadata.get("lat", None)
        # Only a real number is usable as latitude; anything else is missing.
        if isinstance(lat, (int, float)):
            # gsw.p_from_z expects height (positive up). depSM is taken to be
            # depth, positive down, hence the sign flip.
            cast.index = gsw.p_from_z(
                -cast.index.to_numpy(),
                lat,
                geo_strf_dyn_height=0,
                sea_surface_geopotential=0,
            )
            # Replacing the index drops its name, so set it again.
            cast.index.name = "Pressure [dbar]"
        else:
            msg = (
                "Missing latitude information. Cannot compute pressure! "
                f"Your index is {prkey}, please compute pressure manually "
                "with `gsw.p_from_z` and overwrite your index."
            )
            warnings.warn(msg, stacklevel=2)
            cast.index.name = prkey

    # The parser reports "unknown" when the header carries no FileName entry;
    # fall back to the name of the file that was read. An in-memory buffer
    # has no file name, so it keeps the parser's value.
    if metadata.get("name") in (None, "unknown") and not isinstance(
        fname,
        StringIO,
    ):
        metadata["name"] = str(_basename(fname)[1])

    cast._metadata = metadata  # noqa: SLF001
    return cast
[docs]
def from_fsi(fname: str | Path, skiprows: int = 9) -> pd.DataFrame:
    """DataFrame constructor to open Falmouth Scientific, Inc. (FSI) CTD
    ASCII format.

    The first `skiprows` lines are skipped, the next line supplies the
    column names, and all values are read as floats. The frame is indexed by
    its ``PRES`` column.

    Examples
    --------
    >>> from pathlib import Path
    >>> import ctd
    >>> data_path = Path(__file__).parents[1].joinpath("tests", "data")
    >>> cast = ctd.from_fsi(data_path.joinpath("FSI.txt.gz"))
    >>> downcast, upcast = cast.split()
    >>> ax = downcast["TEMP"].plot_cast()

    """
    read_options = {
        "header": "infer",
        "index_col": None,
        "skiprows": skiprows,
        "dtype": float,
        "sep": r"\s+",
    }
    buffer = _read_file(fname)
    table = pd.read_csv(buffer, **read_options)
    buffer.close()

    indexed = table.set_index("PRES", drop=True)
    indexed.index.name = "Pressure [dbar]"
    indexed._metadata = {"name": str(fname)}  # noqa: SLF001
    return indexed
[docs]
def rosette_summary(fname: str | Path) -> pd.DataFrame:
    """Make a BTL (bottle) file from a ROS (bottle log) file.

    More control for the averaging process and at which step we want to
    perform this averaging eliminating the need to read the data into SBE
    Software again after pre-processing.
    NOTE: Do not run LoopEdit on the upcast!

    The file is read with `from_cnv`; its index is copied into a
    ``pressure`` column and the frame is re-indexed by the ``nbf`` column.

    Examples
    --------
    >>> from pathlib import Path
    >>> import ctd
    >>> data_path = Path(__file__).parents[1].joinpath("tests", "data")
    >>> fname = data_path.joinpath("CTD/g01l01s01.ros")
    >>> ros = ctd.rosette_summary(fname)
    >>> ros = ros.groupby(ros.index).mean()
    >>> ros.pressure.to_numpy().astype(int)
    array([835, 806, 705, 604, 503, 404, 303, 201, 151, 100,  51,   1])

    """
    cast = from_cnv(fname)
    # Keep a handle on the metadata: it is re-attached to the re-indexed
    # frame below.
    metadata = cast._metadata  # noqa: SLF001
    cast["pressure"] = cast.index.to_numpy().astype(float)
    cast["nbf"] = cast["nbf"].astype(int)

    summary = cast.set_index("nbf", drop=True)
    summary._metadata = metadata  # noqa: SLF001
    return summary
[docs]
def from_castaway_csv(fname: str | Path) -> pd.DataFrame:
    """DataFrame constructor to open CastAway CSV format.

    Lines starting with ``%`` are metadata (``key,value`` pairs); the first
    remaining line holds the column names and the rest is the profile. The
    frame is indexed by its ``pressure`` column. The ``_metadata`` of the
    result holds the key/value pairs plus ``units``, the text found between
    parentheses in each original column name.

    Raises
    ------
    ValueError
        If the file has no line besides metadata.

    Example:
    -------
    >>> import ctd
    >>> cast = ctd.from_castaway_csv("tests/data/castaway_data.csv")
    >>> cast.columns
    Index(['depth', 'temperature', 'conductivity', 'specific_conductance',
           'salinity', 'sound_velocity', 'density'],
          dtype='str')

    """
    with _read_file(fname) as f:
        # Strip newline characters and drop blank lines, which carry neither
        # metadata nor data.
        lines = [s for s in (raw.strip() for raw in f.readlines()) if s]

    # Separate meta data and CTD profile. The last "%" line is dropped.
    meta = [s for s in lines if s.startswith("%")][0:-1]
    data = [s.split(",") for s in lines if not s.startswith("%")]
    if not data:
        msg = "No CTD profile found: the file only contains metadata lines."
        raise ValueError(msg)

    # NOTE(review): the last data row is left out here, as in the previous
    # implementation; confirm this is intended for CastAway exports.
    cast = pd.DataFrame(data[1:-1], columns=data[0])

    # Convert to numeric
    for col in cast.columns:
        cast[col] = pd.to_numeric(cast[col])

    # Normalise column names and extract units
    units = [s[s.find("(") + 1 : s.find(")")] for s in cast.columns]
    cast.columns = [
        _normalize_names(s.split("(")[0]).lower().replace(" ", "_")
        for s in cast.columns
    ]
    cast = cast.set_index("pressure", drop=True)

    # Add metadata; entries without a value after the comma are skipped.
    metadata = {}
    for entry in meta:
        fields = entry.replace("%", "").strip().split(",")
        if len(fields) > 1:
            metadata[fields[0]] = fields[1]
    metadata["units"] = units
    cast._metadata = metadata  # noqa: SLF001
    return cast