Updated script so it can be controlled by a Node.js web app

This commit is contained in:
mac OS
2024-11-25 12:24:18 +07:00
parent c440eda1f4
commit 8b0ab2bd3a
8662 changed files with 1803808 additions and 34 deletions


@@ -0,0 +1,3 @@
from pandas.io.sas.sasreader import read_sas
__all__ = ["read_sas"]
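# A minimal usage sketch ("example.sas7bdat" is a hypothetical path): the
# re-export above makes read_sas importable from this subpackage as well as
# from the top-level pandas namespace.
#
#     from pandas.io.sas import read_sas
#     df = read_sas("example.sas7bdat")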


@@ -0,0 +1,756 @@
"""
Read SAS7BDAT files
Based on code written by Jared Hobbs:
https://bitbucket.org/jaredhobbs/sas7bdat
See also:
https://github.com/BioStatMatt/sas7bdat
Partial documentation of the file format:
https://cran.r-project.org/package=sas7bdat/vignettes/sas7bdat.pdf
Reference for binary data compression:
http://collaboration.cmc.ec.gc.ca/science/rpn/biblio/ddj/Website/articles/CUJ/1992/9210/ross/ross.htm
"""
from __future__ import annotations
from collections import abc
from datetime import (
datetime,
timedelta,
)
import sys
from typing import TYPE_CHECKING
import numpy as np
from pandas._libs.byteswap import (
read_double_with_byteswap,
read_float_with_byteswap,
read_uint16_with_byteswap,
read_uint32_with_byteswap,
read_uint64_with_byteswap,
)
from pandas._libs.sas import (
Parser,
get_subheader_index,
)
from pandas._libs.tslibs.conversion import cast_from_unit_vectorized
from pandas.errors import EmptyDataError
import pandas as pd
from pandas import (
DataFrame,
Timestamp,
isna,
)
from pandas.io.common import get_handle
import pandas.io.sas.sas_constants as const
from pandas.io.sas.sasreader import ReaderBase
if TYPE_CHECKING:
from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
)
_unix_origin = Timestamp("1970-01-01")
_sas_origin = Timestamp("1960-01-01")
def _parse_datetime(sas_datetime: float, unit: str):
if isna(sas_datetime):
return pd.NaT
if unit == "s":
return datetime(1960, 1, 1) + timedelta(seconds=sas_datetime)
elif unit == "d":
return datetime(1960, 1, 1) + timedelta(days=sas_datetime)
else:
raise ValueError("unit must be 'd' or 's'")
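# A worked sketch of the epoch arithmetic above: SAS counts days or seconds
# from 1960-01-01, and NaN maps to NaT.
#
#     _parse_datetime(0.0, "d")           # -> datetime(1960, 1, 1, 0, 0)
#     _parse_datetime(86400.0, "s")       # -> datetime(1960, 1, 2, 0, 0)
#     _parse_datetime(float("nan"), "d")  # -> pd.NaT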
def _convert_datetimes(sas_datetimes: pd.Series, unit: str) -> pd.Series:
"""
Convert to Timestamp if possible, otherwise to datetime.datetime.
SAS float64 lacks precision beyond ms resolution, so the
conversion to datetime.datetime is lossless.
Parameters
----------
sas_datetimes : {Series, Sequence[float]}
Dates or datetimes in SAS
unit : {'d', 's'}
"d" if the floats represent dates, "s" for datetimes
Returns
-------
Series
Series of datetime64 dtype or datetime.datetime.
"""
td = (_sas_origin - _unix_origin).as_unit("s")
if unit == "s":
millis = cast_from_unit_vectorized(
sas_datetimes._values, unit="s", out_unit="ms"
)
dt64ms = millis.view("M8[ms]") + td
return pd.Series(dt64ms, index=sas_datetimes.index, copy=False)
else:
vals = np.array(sas_datetimes, dtype="M8[D]") + td
return pd.Series(vals, dtype="M8[s]", index=sas_datetimes.index, copy=False)
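# A worked sketch of the vectorized path: the td offset shifts unix-based
# datetime64 values back to the 1960 SAS origin.
#
#     _convert_datetimes(pd.Series([0.0, 365.0]), "d")
#     # -> 1960-01-01 and 1960-12-31 (1960 is a leap year)
#     _convert_datetimes(pd.Series([86400.0]), "s")
#     # -> 1960-01-02 00:00:00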
class _Column:
col_id: int
name: str | bytes
label: str | bytes
format: str | bytes
ctype: bytes
length: int
def __init__(
self,
col_id: int,
# These can be bytes when convert_header_text is False
name: str | bytes,
label: str | bytes,
format: str | bytes,
ctype: bytes,
length: int,
) -> None:
self.col_id = col_id
self.name = name
self.label = label
self.format = format
self.ctype = ctype
self.length = length
# SAS7BDATReader reads a SAS data file in the SAS7BDAT format.
class SAS7BDATReader(ReaderBase, abc.Iterator):
"""
Read SAS files in SAS7BDAT format.
Parameters
----------
path_or_buf : path name or buffer
Name of SAS file or file-like object pointing to SAS file
contents.
index : column identifier, defaults to None
Column to use as index.
convert_dates : bool, defaults to True
Attempt to convert dates to Pandas datetime values. Note that
some rarely used SAS date formats may be unsupported.
blank_missing : bool, defaults to True
Convert empty strings to missing values (SAS uses blanks to
indicate missing character variables).
chunksize : int, defaults to None
Return SAS7BDATReader object for iteration; each call to read
returns a chunk with the given number of rows.
encoding : str or 'infer', defaults to None
String encoding according to the Python standard encodings;
encoding='infer' tries to detect the encoding from the file header,
encoding=None will leave the data in binary format.
convert_text : bool, defaults to True
If False, text variables are left as raw bytes.
convert_header_text : bool, defaults to True
If False, header text, including column names, is left as raw
bytes.
"""
_int_length: int
_cached_page: bytes | None
def __init__(
self,
path_or_buf: FilePath | ReadBuffer[bytes],
index=None,
convert_dates: bool = True,
blank_missing: bool = True,
chunksize: int | None = None,
encoding: str | None = None,
convert_text: bool = True,
convert_header_text: bool = True,
compression: CompressionOptions = "infer",
) -> None:
self.index = index
self.convert_dates = convert_dates
self.blank_missing = blank_missing
self.chunksize = chunksize
self.encoding = encoding
self.convert_text = convert_text
self.convert_header_text = convert_header_text
self.default_encoding = "latin-1"
self.compression = b""
self.column_names_raw: list[bytes] = []
self.column_names: list[str | bytes] = []
self.column_formats: list[str | bytes] = []
self.columns: list[_Column] = []
self._current_page_data_subheader_pointers: list[tuple[int, int]] = []
self._cached_page = None
self._column_data_lengths: list[int] = []
self._column_data_offsets: list[int] = []
self._column_types: list[bytes] = []
self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
self.handles = get_handle(
path_or_buf, "rb", is_text=False, compression=compression
)
self._path_or_buf = self.handles.handle
# Same order as const.SASIndex
self._subheader_processors = [
self._process_rowsize_subheader,
self._process_columnsize_subheader,
self._process_subheader_counts,
self._process_columntext_subheader,
self._process_columnname_subheader,
self._process_columnattributes_subheader,
self._process_format_subheader,
self._process_columnlist_subheader,
None, # Data
]
try:
self._get_properties()
self._parse_metadata()
except Exception:
self.close()
raise
def column_data_lengths(self) -> np.ndarray:
"""Return a numpy int64 array of the column data lengths"""
return np.asarray(self._column_data_lengths, dtype=np.int64)
def column_data_offsets(self) -> np.ndarray:
"""Return a numpy int64 array of the column offsets"""
return np.asarray(self._column_data_offsets, dtype=np.int64)
def column_types(self) -> np.ndarray:
"""
Returns a numpy character array of the column types:
s (string) or d (double)
"""
return np.asarray(self._column_types, dtype=np.dtype("S1"))
def close(self) -> None:
self.handles.close()
def _get_properties(self) -> None:
# Check magic number
self._path_or_buf.seek(0)
self._cached_page = self._path_or_buf.read(288)
if self._cached_page[0 : len(const.magic)] != const.magic:
raise ValueError("magic number mismatch (not a SAS file?)")
# Get alignment information
buf = self._read_bytes(const.align_1_offset, const.align_1_length)
if buf == const.u64_byte_checker_value:
self.U64 = True
self._int_length = 8
self._page_bit_offset = const.page_bit_offset_x64
self._subheader_pointer_length = const.subheader_pointer_length_x64
else:
self.U64 = False
self._page_bit_offset = const.page_bit_offset_x86
self._subheader_pointer_length = const.subheader_pointer_length_x86
self._int_length = 4
buf = self._read_bytes(const.align_2_offset, const.align_2_length)
if buf == const.align_1_checker_value:
align1 = const.align_2_value
else:
align1 = 0
# Get endianness information
buf = self._read_bytes(const.endianness_offset, const.endianness_length)
if buf == b"\x01":
self.byte_order = "<"
self.need_byteswap = sys.byteorder == "big"
else:
self.byte_order = ">"
self.need_byteswap = sys.byteorder == "little"
# Get encoding information
buf = self._read_bytes(const.encoding_offset, const.encoding_length)[0]
if buf in const.encoding_names:
self.inferred_encoding = const.encoding_names[buf]
if self.encoding == "infer":
self.encoding = self.inferred_encoding
else:
self.inferred_encoding = f"unknown (code={buf})"
# Timestamp is epoch 01/01/1960
epoch = datetime(1960, 1, 1)
x = self._read_float(
const.date_created_offset + align1, const.date_created_length
)
self.date_created = epoch + pd.to_timedelta(x, unit="s")
x = self._read_float(
const.date_modified_offset + align1, const.date_modified_length
)
self.date_modified = epoch + pd.to_timedelta(x, unit="s")
self.header_length = self._read_uint(
const.header_size_offset + align1, const.header_size_length
)
# Read the rest of the header into cached_page.
buf = self._path_or_buf.read(self.header_length - 288)
self._cached_page += buf
# error: Argument 1 to "len" has incompatible type "Optional[bytes]";
# expected "Sized"
if len(self._cached_page) != self.header_length: # type: ignore[arg-type]
raise ValueError("The SAS7BDAT file appears to be truncated.")
self._page_length = self._read_uint(
const.page_size_offset + align1, const.page_size_length
)
def __next__(self) -> DataFrame:
da = self.read(nrows=self.chunksize or 1)
if da.empty:
self.close()
raise StopIteration
return da
# Read a single float of the given width (4 or 8).
def _read_float(self, offset: int, width: int):
assert self._cached_page is not None
if width == 4:
return read_float_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
elif width == 8:
return read_double_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
else:
self.close()
raise ValueError("invalid float width")
# Read a single unsigned integer of the given width (1, 2, 4 or 8).
def _read_uint(self, offset: int, width: int) -> int:
assert self._cached_page is not None
if width == 1:
return self._read_bytes(offset, 1)[0]
elif width == 2:
return read_uint16_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
elif width == 4:
return read_uint32_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
elif width == 8:
return read_uint64_with_byteswap(
self._cached_page, offset, self.need_byteswap
)
else:
self.close()
raise ValueError("invalid int width")
def _read_bytes(self, offset: int, length: int):
assert self._cached_page is not None
if offset + length > len(self._cached_page):
self.close()
raise ValueError("The cached page is too small.")
return self._cached_page[offset : offset + length]
def _read_and_convert_header_text(self, offset: int, length: int) -> str | bytes:
return self._convert_header_text(
self._read_bytes(offset, length).rstrip(b"\x00 ")
)
def _parse_metadata(self) -> None:
done = False
while not done:
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
break
if len(self._cached_page) != self._page_length:
raise ValueError("Failed to read a meta data page from the SAS file.")
done = self._process_page_meta()
def _process_page_meta(self) -> bool:
self._read_page_header()
pt = const.page_meta_types + [const.page_amd_type, const.page_mix_type]
if self._current_page_type in pt:
self._process_page_metadata()
is_data_page = self._current_page_type == const.page_data_type
is_mix_page = self._current_page_type == const.page_mix_type
return bool(
is_data_page
or is_mix_page
or self._current_page_data_subheader_pointers != []
)
def _read_page_header(self) -> None:
bit_offset = self._page_bit_offset
tx = const.page_type_offset + bit_offset
self._current_page_type = (
self._read_uint(tx, const.page_type_length) & const.page_type_mask2
)
tx = const.block_count_offset + bit_offset
self._current_page_block_count = self._read_uint(tx, const.block_count_length)
tx = const.subheader_count_offset + bit_offset
self._current_page_subheaders_count = self._read_uint(
tx, const.subheader_count_length
)
def _process_page_metadata(self) -> None:
bit_offset = self._page_bit_offset
for i in range(self._current_page_subheaders_count):
offset = const.subheader_pointers_offset + bit_offset
total_offset = offset + self._subheader_pointer_length * i
subheader_offset = self._read_uint(total_offset, self._int_length)
total_offset += self._int_length
subheader_length = self._read_uint(total_offset, self._int_length)
total_offset += self._int_length
subheader_compression = self._read_uint(total_offset, 1)
total_offset += 1
subheader_type = self._read_uint(total_offset, 1)
if (
subheader_length == 0
or subheader_compression == const.truncated_subheader_id
):
continue
subheader_signature = self._read_bytes(subheader_offset, self._int_length)
subheader_index = get_subheader_index(subheader_signature)
subheader_processor = self._subheader_processors[subheader_index]
if subheader_processor is None:
f1 = subheader_compression in (const.compressed_subheader_id, 0)
f2 = subheader_type == const.compressed_subheader_type
if self.compression and f1 and f2:
self._current_page_data_subheader_pointers.append(
(subheader_offset, subheader_length)
)
else:
self.close()
raise ValueError(
f"Unknown subheader signature {subheader_signature}"
)
else:
subheader_processor(subheader_offset, subheader_length)
def _process_rowsize_subheader(self, offset: int, length: int) -> None:
int_len = self._int_length
lcs_offset = offset
lcp_offset = offset
if self.U64:
lcs_offset += 682
lcp_offset += 706
else:
lcs_offset += 354
lcp_offset += 378
self.row_length = self._read_uint(
offset + const.row_length_offset_multiplier * int_len,
int_len,
)
self.row_count = self._read_uint(
offset + const.row_count_offset_multiplier * int_len,
int_len,
)
self.col_count_p1 = self._read_uint(
offset + const.col_count_p1_multiplier * int_len, int_len
)
self.col_count_p2 = self._read_uint(
offset + const.col_count_p2_multiplier * int_len, int_len
)
mx = const.row_count_on_mix_page_offset_multiplier * int_len
self._mix_page_row_count = self._read_uint(offset + mx, int_len)
self._lcs = self._read_uint(lcs_offset, 2)
self._lcp = self._read_uint(lcp_offset, 2)
def _process_columnsize_subheader(self, offset: int, length: int) -> None:
int_len = self._int_length
offset += int_len
self.column_count = self._read_uint(offset, int_len)
if self.col_count_p1 + self.col_count_p2 != self.column_count:
print(
f"Warning: column count mismatch ({self.col_count_p1} + "
f"{self.col_count_p2} != {self.column_count})\n"
)
# Unknown purpose
def _process_subheader_counts(self, offset: int, length: int) -> None:
pass
def _process_columntext_subheader(self, offset: int, length: int) -> None:
offset += self._int_length
text_block_size = self._read_uint(offset, const.text_block_size_length)
buf = self._read_bytes(offset, text_block_size)
cname_raw = buf[0:text_block_size].rstrip(b"\x00 ")
self.column_names_raw.append(cname_raw)
if len(self.column_names_raw) == 1:
compression_literal = b""
for cl in const.compression_literals:
if cl in cname_raw:
compression_literal = cl
self.compression = compression_literal
offset -= self._int_length
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
compression_literal = buf.rstrip(b"\x00")
if compression_literal == b"":
self._lcs = 0
offset1 = offset + 32
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0 : self._lcp]
elif compression_literal == const.rle_compression:
offset1 = offset + 40
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcp)
self.creator_proc = buf[0 : self._lcp]
elif self._lcs > 0:
self._lcp = 0
offset1 = offset + 16
if self.U64:
offset1 += 4
buf = self._read_bytes(offset1, self._lcs)
self.creator_proc = buf[0 : self._lcp]
if hasattr(self, "creator_proc"):
self.creator_proc = self._convert_header_text(self.creator_proc)
def _process_columnname_subheader(self, offset: int, length: int) -> None:
int_len = self._int_length
offset += int_len
column_name_pointers_count = (length - 2 * int_len - 12) // 8
for i in range(column_name_pointers_count):
text_subheader = (
offset
+ const.column_name_pointer_length * (i + 1)
+ const.column_name_text_subheader_offset
)
col_name_offset = (
offset
+ const.column_name_pointer_length * (i + 1)
+ const.column_name_offset_offset
)
col_name_length = (
offset
+ const.column_name_pointer_length * (i + 1)
+ const.column_name_length_offset
)
idx = self._read_uint(
text_subheader, const.column_name_text_subheader_length
)
col_offset = self._read_uint(
col_name_offset, const.column_name_offset_length
)
col_len = self._read_uint(col_name_length, const.column_name_length_length)
name_raw = self.column_names_raw[idx]
cname = name_raw[col_offset : col_offset + col_len]
self.column_names.append(self._convert_header_text(cname))
def _process_columnattributes_subheader(self, offset: int, length: int) -> None:
int_len = self._int_length
column_attributes_vectors_count = (length - 2 * int_len - 12) // (int_len + 8)
for i in range(column_attributes_vectors_count):
col_data_offset = (
offset + int_len + const.column_data_offset_offset + i * (int_len + 8)
)
col_data_len = (
offset
+ 2 * int_len
+ const.column_data_length_offset
+ i * (int_len + 8)
)
col_types = (
offset + 2 * int_len + const.column_type_offset + i * (int_len + 8)
)
x = self._read_uint(col_data_offset, int_len)
self._column_data_offsets.append(x)
x = self._read_uint(col_data_len, const.column_data_length_length)
self._column_data_lengths.append(x)
x = self._read_uint(col_types, const.column_type_length)
self._column_types.append(b"d" if x == 1 else b"s")
def _process_columnlist_subheader(self, offset: int, length: int) -> None:
# unknown purpose
pass
def _process_format_subheader(self, offset: int, length: int) -> None:
int_len = self._int_length
text_subheader_format = (
offset + const.column_format_text_subheader_index_offset + 3 * int_len
)
col_format_offset = offset + const.column_format_offset_offset + 3 * int_len
col_format_len = offset + const.column_format_length_offset + 3 * int_len
text_subheader_label = (
offset + const.column_label_text_subheader_index_offset + 3 * int_len
)
col_label_offset = offset + const.column_label_offset_offset + 3 * int_len
col_label_len = offset + const.column_label_length_offset + 3 * int_len
x = self._read_uint(
text_subheader_format, const.column_format_text_subheader_index_length
)
format_idx = min(x, len(self.column_names_raw) - 1)
format_start = self._read_uint(
col_format_offset, const.column_format_offset_length
)
format_len = self._read_uint(col_format_len, const.column_format_length_length)
label_idx = self._read_uint(
text_subheader_label, const.column_label_text_subheader_index_length
)
label_idx = min(label_idx, len(self.column_names_raw) - 1)
label_start = self._read_uint(
col_label_offset, const.column_label_offset_length
)
label_len = self._read_uint(col_label_len, const.column_label_length_length)
label_names = self.column_names_raw[label_idx]
column_label = self._convert_header_text(
label_names[label_start : label_start + label_len]
)
format_names = self.column_names_raw[format_idx]
column_format = self._convert_header_text(
format_names[format_start : format_start + format_len]
)
current_column_number = len(self.columns)
col = _Column(
current_column_number,
self.column_names[current_column_number],
column_label,
column_format,
self._column_types[current_column_number],
self._column_data_lengths[current_column_number],
)
self.column_formats.append(column_format)
self.columns.append(col)
def read(self, nrows: int | None = None) -> DataFrame:
if (nrows is None) and (self.chunksize is not None):
nrows = self.chunksize
elif nrows is None:
nrows = self.row_count
if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")
if nrows > 0 and self._current_row_in_file_index >= self.row_count:
return DataFrame()
nrows = min(nrows, self.row_count - self._current_row_in_file_index)
nd = self._column_types.count(b"d")
ns = self._column_types.count(b"s")
self._string_chunk = np.empty((ns, nrows), dtype=object)
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
self._current_row_in_chunk_index = 0
p = Parser(self)
p.read(nrows)
rslt = self._chunk_to_dataframe()
if self.index is not None:
rslt = rslt.set_index(self.index)
return rslt
def _read_next_page(self):
self._current_page_data_subheader_pointers = []
self._cached_page = self._path_or_buf.read(self._page_length)
if len(self._cached_page) <= 0:
return True
elif len(self._cached_page) != self._page_length:
self.close()
msg = (
"failed to read complete page from file (read "
f"{len(self._cached_page):d} of {self._page_length:d} bytes)"
)
raise ValueError(msg)
self._read_page_header()
if self._current_page_type in const.page_meta_types:
self._process_page_metadata()
if self._current_page_type not in const.page_meta_types + [
const.page_data_type,
const.page_mix_type,
]:
return self._read_next_page()
return False
def _chunk_to_dataframe(self) -> DataFrame:
n = self._current_row_in_chunk_index
m = self._current_row_in_file_index
ix = range(m - n, m)
rslt = {}
js, jb = 0, 0
for j in range(self.column_count):
name = self.column_names[j]
if self._column_types[j] == b"d":
col_arr = self._byte_chunk[jb, :].view(dtype=self.byte_order + "d")
rslt[name] = pd.Series(col_arr, dtype=np.float64, index=ix, copy=False)
if self.convert_dates:
if self.column_formats[j] in const.sas_date_formats:
rslt[name] = _convert_datetimes(rslt[name], "d")
elif self.column_formats[j] in const.sas_datetime_formats:
rslt[name] = _convert_datetimes(rslt[name], "s")
jb += 1
elif self._column_types[j] == b"s":
rslt[name] = pd.Series(self._string_chunk[js, :], index=ix, copy=False)
if self.convert_text and (self.encoding is not None):
rslt[name] = self._decode_string(rslt[name].str)
js += 1
else:
self.close()
raise ValueError(f"unknown column type {repr(self._column_types[j])}")
df = DataFrame(rslt, columns=self.column_names, index=ix, copy=False)
return df
def _decode_string(self, b):
return b.decode(self.encoding or self.default_encoding)
def _convert_header_text(self, b: bytes) -> str | bytes:
if self.convert_header_text:
return self._decode_string(b)
else:
return b
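# A usage sketch for the reader above ("example.sas7bdat" and process() are
# hypothetical): iterate a large file in fixed-size chunks, letting the file
# header declare its own text encoding.
#
#     from pandas.io.sas.sas7bdat import SAS7BDATReader
#
#     with SAS7BDATReader("example.sas7bdat", encoding="infer",
#                         chunksize=50_000) as rdr:
#         for chunk in rdr:
#             process(chunk)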


@@ -0,0 +1,310 @@
from __future__ import annotations
from typing import Final
magic: Final = (
b"\x00\x00\x00\x00\x00\x00\x00\x00"
b"\x00\x00\x00\x00\xc2\xea\x81\x60"
b"\xb3\x14\x11\xcf\xbd\x92\x08\x00"
b"\x09\xc7\x31\x8c\x18\x1f\x10\x11"
)
align_1_checker_value: Final = b"3"
align_1_offset: Final = 32
align_1_length: Final = 1
align_1_value: Final = 4
u64_byte_checker_value: Final = b"3"
align_2_offset: Final = 35
align_2_length: Final = 1
align_2_value: Final = 4
endianness_offset: Final = 37
endianness_length: Final = 1
platform_offset: Final = 39
platform_length: Final = 1
encoding_offset: Final = 70
encoding_length: Final = 1
dataset_offset: Final = 92
dataset_length: Final = 64
file_type_offset: Final = 156
file_type_length: Final = 8
date_created_offset: Final = 164
date_created_length: Final = 8
date_modified_offset: Final = 172
date_modified_length: Final = 8
header_size_offset: Final = 196
header_size_length: Final = 4
page_size_offset: Final = 200
page_size_length: Final = 4
page_count_offset: Final = 204
page_count_length: Final = 4
sas_release_offset: Final = 216
sas_release_length: Final = 8
sas_server_type_offset: Final = 224
sas_server_type_length: Final = 16
os_version_number_offset: Final = 240
os_version_number_length: Final = 16
os_maker_offset: Final = 256
os_maker_length: Final = 16
os_name_offset: Final = 272
os_name_length: Final = 16
page_bit_offset_x86: Final = 16
page_bit_offset_x64: Final = 32
subheader_pointer_length_x86: Final = 12
subheader_pointer_length_x64: Final = 24
page_type_offset: Final = 0
page_type_length: Final = 2
block_count_offset: Final = 2
block_count_length: Final = 2
subheader_count_offset: Final = 4
subheader_count_length: Final = 2
page_type_mask: Final = 0x0F00
# Keep "page_comp_type" bits
page_type_mask2: Final = 0xF000 | page_type_mask
page_meta_type: Final = 0x0000
page_data_type: Final = 0x0100
page_mix_type: Final = 0x0200
page_amd_type: Final = 0x0400
page_meta2_type: Final = 0x4000
page_comp_type: Final = 0x9000
page_meta_types: Final = [page_meta_type, page_meta2_type]
subheader_pointers_offset: Final = 8
truncated_subheader_id: Final = 1
compressed_subheader_id: Final = 4
compressed_subheader_type: Final = 1
text_block_size_length: Final = 2
row_length_offset_multiplier: Final = 5
row_count_offset_multiplier: Final = 6
col_count_p1_multiplier: Final = 9
col_count_p2_multiplier: Final = 10
row_count_on_mix_page_offset_multiplier: Final = 15
column_name_pointer_length: Final = 8
column_name_text_subheader_offset: Final = 0
column_name_text_subheader_length: Final = 2
column_name_offset_offset: Final = 2
column_name_offset_length: Final = 2
column_name_length_offset: Final = 4
column_name_length_length: Final = 2
column_data_offset_offset: Final = 8
column_data_length_offset: Final = 8
column_data_length_length: Final = 4
column_type_offset: Final = 14
column_type_length: Final = 1
column_format_text_subheader_index_offset: Final = 22
column_format_text_subheader_index_length: Final = 2
column_format_offset_offset: Final = 24
column_format_offset_length: Final = 2
column_format_length_offset: Final = 26
column_format_length_length: Final = 2
column_label_text_subheader_index_offset: Final = 28
column_label_text_subheader_index_length: Final = 2
column_label_offset_offset: Final = 30
column_label_offset_length: Final = 2
column_label_length_offset: Final = 32
column_label_length_length: Final = 2
rle_compression: Final = b"SASYZCRL"
rdc_compression: Final = b"SASYZCR2"
compression_literals: Final = [rle_compression, rdc_compression]
# Incomplete list of encodings, using SAS nomenclature:
# https://support.sas.com/documentation/onlinedoc/dfdmstudio/2.6/dmpdmsug/Content/dfU_Encodings_SAS.html
# corresponding to the Python documentation of standard encodings
# https://docs.python.org/3/library/codecs.html#standard-encodings
encoding_names: Final = {
20: "utf-8",
29: "latin1",
30: "latin2",
31: "latin3",
32: "latin4",
33: "cyrillic",
34: "arabic",
35: "greek",
36: "hebrew",
37: "latin5",
38: "latin6",
39: "cp874",
40: "latin9",
41: "cp437",
42: "cp850",
43: "cp852",
44: "cp857",
45: "cp858",
46: "cp862",
47: "cp864",
48: "cp865",
49: "cp866",
50: "cp869",
51: "cp874",
# 52: "", # not found
# 53: "", # not found
# 54: "", # not found
55: "cp720",
56: "cp737",
57: "cp775",
58: "cp860",
59: "cp863",
60: "cp1250",
61: "cp1251",
62: "cp1252",
63: "cp1253",
64: "cp1254",
65: "cp1255",
66: "cp1256",
67: "cp1257",
68: "cp1258",
118: "cp950",
# 119: "", # not found
123: "big5",
125: "gb2312",
126: "cp936",
134: "euc_jp",
136: "cp932",
138: "shift_jis",
140: "euc-kr",
141: "cp949",
227: "latin8",
# 228: "", # not found
# 229: "" # not found
}
class SASIndex:
row_size_index: Final = 0
column_size_index: Final = 1
subheader_counts_index: Final = 2
column_text_index: Final = 3
column_name_index: Final = 4
column_attributes_index: Final = 5
format_and_label_index: Final = 6
column_list_index: Final = 7
data_subheader_index: Final = 8
subheader_signature_to_index: Final = {
b"\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\x00\x00\x00\x00\xF7\xF7\xF7\xF7": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\x00\x00\x00\x00": SASIndex.row_size_index,
b"\xF7\xF7\xF7\xF7\xFF\xFF\xFB\xFE": SASIndex.row_size_index,
b"\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\x00\x00\x00\x00\xF6\xF6\xF6\xF6": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\x00\x00\x00\x00": SASIndex.column_size_index,
b"\xF6\xF6\xF6\xF6\xFF\xFF\xFB\xFE": SASIndex.column_size_index,
b"\x00\xFC\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\x00\xFC\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.subheader_counts_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFC\x00": SASIndex.subheader_counts_index,
b"\xFD\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFD\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFD": SASIndex.column_text_index,
b"\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_name_index,
b"\xFC\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFC\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_attributes_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFC": SASIndex.column_attributes_index,
b"\xFE\xFB\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFB\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.format_and_label_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFB\xFE": SASIndex.format_and_label_index,
b"\xFE\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
b"\xFE\xFF\xFF\xFF\xFF\xFF\xFF\xFF": SASIndex.column_list_index,
b"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFE": SASIndex.column_list_index,
}
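# A pure-Python sketch of the lookup performed by the compiled helper
# pandas._libs.sas.get_subheader_index (the name _get_subheader_index_py is
# hypothetical): signatures missing from the table are treated as data
# subheaders.
def _get_subheader_index_py(signature: bytes) -> int:
    return subheader_signature_to_index.get(signature, SASIndex.data_subheader_index)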
# List of frequently used SAS date and datetime formats
# http://support.sas.com/documentation/cdl/en/etsug/60372/HTML/default/viewer.htm#etsug_intervals_sect009.htm
# https://github.com/epam/parso/blob/master/src/main/java/com/epam/parso/impl/SasFileConstants.java
sas_date_formats: Final = (
"DATE",
"DAY",
"DDMMYY",
"DOWNAME",
"JULDAY",
"JULIAN",
"MMDDYY",
"MMYY",
"MMYYC",
"MMYYD",
"MMYYP",
"MMYYS",
"MMYYN",
"MONNAME",
"MONTH",
"MONYY",
"QTR",
"QTRR",
"NENGO",
"WEEKDATE",
"WEEKDATX",
"WEEKDAY",
"WEEKV",
"WORDDATE",
"WORDDATX",
"YEAR",
"YYMM",
"YYMMC",
"YYMMD",
"YYMMP",
"YYMMS",
"YYMMN",
"YYMON",
"YYMMDD",
"YYQ",
"YYQC",
"YYQD",
"YYQP",
"YYQS",
"YYQN",
"YYQR",
"YYQRC",
"YYQRD",
"YYQRP",
"YYQRS",
"YYQRN",
"YYMMDDP",
"YYMMDDC",
"E8601DA",
"YYMMDDN",
"MMDDYYC",
"MMDDYYS",
"MMDDYYD",
"YYMMDDS",
"B8601DA",
"DDMMYYN",
"YYMMDDD",
"DDMMYYB",
"DDMMYYP",
"MMDDYYP",
"YYMMDDB",
"MMDDYYN",
"DDMMYYC",
"DDMMYYD",
"DDMMYYS",
"MINGUO",
)
sas_datetime_formats: Final = (
"DATETIME",
"DTWKDATX",
"B8601DN",
"B8601DT",
"B8601DX",
"B8601DZ",
"B8601LX",
"E8601DN",
"E8601DT",
"E8601DX",
"E8601DZ",
"E8601LX",
"DATEAMPM",
"DTDATE",
"DTMONYY",
"DTMONYY",
"DTWKDATX",
"DTYEAR",
"TOD",
"MDYAMPM",
)
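# A helper sketch (not part of this module; _datetime_unit is hypothetical)
# mirroring how SAS7BDATReader._chunk_to_dataframe consumes the two tuples
# above: the format stored in the file is the bare name, without width or
# decimal specifiers, so membership is a plain string test.
def _datetime_unit(fmt: str) -> str | None:
    if fmt in sas_date_formats:
        return "d"  # date: days since 1960-01-01
    if fmt in sas_datetime_formats:
        return "s"  # datetime: seconds since 1960-01-01
    return None  # leave the column as a plain float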


@@ -0,0 +1,508 @@
"""
Read a SAS XPort format file into a Pandas DataFrame.
Based on code from Jack Cushman (github.com/jcushman/xport).
The file format is defined here:
https://support.sas.com/content/dam/SAS/support/en/technical-papers/record-layout-of-a-sas-version-5-or-6-data-set-in-sas-transport-xport-format.pdf
"""
from __future__ import annotations
from collections import abc
from datetime import datetime
import struct
from typing import TYPE_CHECKING
import warnings
import numpy as np
from pandas.util._decorators import Appender
from pandas.util._exceptions import find_stack_level
import pandas as pd
from pandas.io.common import get_handle
from pandas.io.sas.sasreader import ReaderBase
if TYPE_CHECKING:
from pandas._typing import (
CompressionOptions,
DatetimeNaTType,
FilePath,
ReadBuffer,
)
_correct_line1 = (
"HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!"
"000000000000000000000000000000  "
)
_correct_header1 = (
"HEADER RECORD*******MEMBER  HEADER RECORD!!!!!!!000000000000000001600000000"
)
_correct_header2 = (
"HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!"
"000000000000000000000000000000  "
)
_correct_obs_header = (
"HEADER RECORD*******OBS     HEADER RECORD!!!!!!!"
"000000000000000000000000000000  "
)
_fieldkeys = [
"ntype",
"nhfun",
"field_length",
"nvar0",
"name",
"label",
"nform",
"nfl",
"num_decimals",
"nfj",
"nfill",
"niform",
"nifl",
"nifd",
"npos",
"_",
]
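# These sixteen keys line up one-for-one with the 140-byte NAMESTR record
# unpacked in XportReader._read_header; a quick consistency check (a sketch,
# not executed here):
#
#     assert len(_fieldkeys) == 16
#     assert struct.calcsize(">hhhh8s40s8shhh2s8shhl52s") == 140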
_base_params_doc = """\
Parameters
----------
filepath_or_buffer : str or file-like object
Path to SAS file or object implementing binary read method."""
_params2_doc = """\
index : identifier of index column
Identifier of column that should be used as index of the DataFrame.
encoding : str
Encoding for text data.
chunksize : int
Read file `chunksize` lines at a time, returns iterator."""
_format_params_doc = """\
format : str
File format, only `xport` is currently supported."""
_iterator_doc = """\
iterator : bool, default False
Return XportReader object for reading file incrementally."""
_read_sas_doc = f"""Read a SAS file into a DataFrame.
{_base_params_doc}
{_format_params_doc}
{_params2_doc}
{_iterator_doc}
Returns
-------
DataFrame or XportReader
Examples
--------
Read a SAS Xport file:
>>> df = pd.read_sas('filename.XPT')
Read a Xport file in 10,000 line chunks:
>>> itr = pd.read_sas('filename.XPT', chunksize=10000)
>>> for chunk in itr:
...     do_something(chunk)
"""
_xport_reader_doc = f"""\
Class for reading SAS Xport files.
{_base_params_doc}
{_params2_doc}
Attributes
----------
member_info : list
Contains information about the file
fields : list
Contains information about the variables in the file
"""
_read_method_doc = """\
Read observations from SAS Xport file, returning as data frame.
Parameters
----------
nrows : int
Number of rows to read from data file; if None, read whole
file.
Returns
-------
A DataFrame.
"""
def _parse_date(datestr: str) -> DatetimeNaTType:
"""Given a date in xport format, return Python date."""
try:
# e.g. "16FEB11:10:07:55"
return datetime.strptime(datestr, "%d%b%y:%H:%M:%S")
except ValueError:
return pd.NaT
def _split_line(s: str, parts):
"""
Parameters
----------
s: str
Fixed-length string to split
parts: list of (name, length) pairs
Used to break up string, name '_' will be filtered from output.
Returns
-------
Dict of name:contents of string at given location.
"""
out = {}
start = 0
for name, length in parts:
out[name] = s[start : start + length].strip()
start += length
del out["_"]
return out
def _handle_truncated_float_vec(vec, nbytes):
# This feature is not well documented, but some SAS XPORT files
# have 2-7 byte "truncated" floats. To read these truncated
# floats, pad them with zeros on the right to make 8 byte floats.
#
# References:
# https://github.com/jcushman/xport/pull/3
# The R "foreign" library
if nbytes != 8:
vec1 = np.zeros(len(vec), np.dtype("S8"))
dtype = np.dtype(f"S{nbytes},S{8 - nbytes}")
vec2 = vec1.view(dtype=dtype)
vec2["f0"] = vec
return vec2
return vec
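# A worked sketch: a 4-byte truncated value is right-padded with zero bytes,
# so b"\x41\x10\x00\x00" becomes the 8-byte buffer
# b"\x41\x10\x00\x00\x00\x00\x00\x00" (IBM hex for 1.0).
#
#     vec = np.array([b"\x41\x10\x00\x00"], dtype="S4")
#     padded = _handle_truncated_float_vec(vec, 4)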
def _parse_float_vec(vec):
"""
Parse a vector of float values representing IBM 8 byte floats into
native 8 byte floats.
"""
dtype = np.dtype(">u4,>u4")
vec1 = vec.view(dtype=dtype)
xport1 = vec1["f0"]
xport2 = vec1["f1"]
# Start by setting first half of ieee number to first half of IBM
# number sans exponent
ieee1 = xport1 & 0x00FFFFFF
# The fraction bit to the left of the binary point in the ieee
# format was set and the number was shifted 0, 1, 2, or 3
# places. This will tell us how to adjust the ibm exponent to be a
# power of 2 ieee exponent and how to shift the fraction bits to
# restore the correct magnitude.
shift = np.zeros(len(vec), dtype=np.uint8)
shift[np.where(xport1 & 0x00200000)] = 1
shift[np.where(xport1 & 0x00400000)] = 2
shift[np.where(xport1 & 0x00800000)] = 3
# shift the ieee number down the correct number of places then
# set the second half of the ieee number to be the second half
# of the ibm number shifted appropriately, ored with the bits
# from the first half that would have been shifted in if we
# could shift a double. All we are worried about are the low
# order 3 bits of the first half since we're only shifting by
# 1, 2, or 3.
ieee1 >>= shift
ieee2 = (xport2 >> shift) | ((xport1 & 0x00000007) << (29 + (3 - shift)))
# clear the 1 bit to the left of the binary point
ieee1 &= 0xFFEFFFFF
# set the exponent of the ieee number to be the actual exponent
# plus the shift count + 1023. Or this into the first half of the
# ieee number. The ibm exponent is excess 64 but is adjusted by 65
# since during conversion to ibm format the exponent is
# incremented by 1 and the fraction bits left 4 positions to the
# right of the radix point. (had to add >> 24 because C treats &
# 0x7f as 0x7f000000 and Python doesn't)
ieee1 |= ((((((xport1 >> 24) & 0x7F) - 65) << 2) + shift + 1023) << 20) | (
xport1 & 0x80000000
)
ieee = np.empty((len(ieee1),), dtype=">u4,>u4")
ieee["f0"] = ieee1
ieee["f1"] = ieee2
ieee = ieee.view(dtype=">f8")
ieee = ieee.astype("f8")
return ieee
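# A worked sketch: IBM hex floats encode value = fraction * 16**(exponent - 64),
# so 1.0 is stored with exponent byte 0x41 (65) and fraction 0x100000 (1/16).
#
#     ibm = np.array([b"\x41\x10\x00\x00\x00\x00\x00\x00"], dtype="S8")
#     _parse_float_vec(ibm)  # -> array([1.])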
class XportReader(ReaderBase, abc.Iterator):
__doc__ = _xport_reader_doc
def __init__(
self,
filepath_or_buffer: FilePath | ReadBuffer[bytes],
index=None,
encoding: str | None = "ISO-8859-1",
chunksize: int | None = None,
compression: CompressionOptions = "infer",
) -> None:
self._encoding = encoding
self._lines_read = 0
self._index = index
self._chunksize = chunksize
self.handles = get_handle(
filepath_or_buffer,
"rb",
encoding=encoding,
is_text=False,
compression=compression,
)
self.filepath_or_buffer = self.handles.handle
try:
self._read_header()
except Exception:
self.close()
raise
def close(self) -> None:
self.handles.close()
def _get_row(self):
return self.filepath_or_buffer.read(80).decode()
def _read_header(self) -> None:
self.filepath_or_buffer.seek(0)
# read file header
line1 = self._get_row()
if line1 != _correct_line1:
if "**COMPRESSED**" in line1:
# this was created with the PROC CPORT method and can't be read
# https://documentation.sas.com/doc/en/pgmsascdc/9.4_3.5/movefile/p1bm6aqp3fw4uin1hucwh718f6kp.htm
raise ValueError(
"Header record indicates a CPORT file, which is not readable."
)
raise ValueError("Header record is not an XPORT file.")
line2 = self._get_row()
fif = [["prefix", 24], ["version", 8], ["OS", 8], ["_", 24], ["created", 16]]
file_info = _split_line(line2, fif)
if file_info["prefix"] != "SAS SAS SASLIB":
raise ValueError("Header record has invalid prefix.")
file_info["created"] = _parse_date(file_info["created"])
self.file_info = file_info
line3 = self._get_row()
file_info["modified"] = _parse_date(line3[:16])
# read member header
header1 = self._get_row()
header2 = self._get_row()
headflag1 = header1.startswith(_correct_header1)
headflag2 = header2 == _correct_header2
if not (headflag1 and headflag2):
raise ValueError("Member header not found")
# usually 140, could be 135
fieldnamelength = int(header1[-5:-2])
# member info
mem = [
["prefix", 8],
["set_name", 8],
["sasdata", 8],
["version", 8],
["OS", 8],
["_", 24],
["created", 16],
]
member_info = _split_line(self._get_row(), mem)
mem = [["modified", 16], ["_", 16], ["label", 40], ["type", 8]]
member_info.update(_split_line(self._get_row(), mem))
member_info["modified"] = _parse_date(member_info["modified"])
member_info["created"] = _parse_date(member_info["created"])
self.member_info = member_info
# read field names
types = {1: "numeric", 2: "char"}
fieldcount = int(self._get_row()[54:58])
datalength = fieldnamelength * fieldcount
# round up to nearest 80
if datalength % 80:
datalength += 80 - datalength % 80
fielddata = self.filepath_or_buffer.read(datalength)
fields = []
obs_length = 0
while len(fielddata) >= fieldnamelength:
# pull data for one field
fieldbytes, fielddata = (
fielddata[:fieldnamelength],
fielddata[fieldnamelength:],
)
# rest at end gets ignored, so if field is short, pad out
# to match struct pattern below
fieldbytes = fieldbytes.ljust(140)
fieldstruct = struct.unpack(">hhhh8s40s8shhh2s8shhl52s", fieldbytes)
field = dict(zip(_fieldkeys, fieldstruct))
del field["_"]
field["ntype"] = types[field["ntype"]]
fl = field["field_length"]
if field["ntype"] == "numeric" and ((fl < 2) or (fl > 8)):
msg = f"Floating field width {fl} is not between 2 and 8."
raise TypeError(msg)
for k, v in field.items():
try:
field[k] = v.strip()
except AttributeError:
pass
obs_length += field["field_length"]
fields += [field]
header = self._get_row()
if header != _correct_obs_header:
raise ValueError("Observation header not found.")
self.fields = fields
self.record_length = obs_length
self.record_start = self.filepath_or_buffer.tell()
self.nobs = self._record_count()
self.columns = [x["name"].decode() for x in self.fields]
# Setup the dtype.
dtypel = [
("s" + str(i), "S" + str(field["field_length"]))
for i, field in enumerate(self.fields)
]
dtype = np.dtype(dtypel)
self._dtype = dtype
def __next__(self) -> pd.DataFrame:
return self.read(nrows=self._chunksize or 1)
def _record_count(self) -> int:
"""
Get number of records in file.
This is maybe suboptimal because we have to seek to the end of
the file.
Side effect: restores the file position to record_start.
"""
self.filepath_or_buffer.seek(0, 2)
total_records_length = self.filepath_or_buffer.tell() - self.record_start
if total_records_length % 80 != 0:
warnings.warn(
"xport file may be corrupted.",
stacklevel=find_stack_level(),
)
if self.record_length > 80:
self.filepath_or_buffer.seek(self.record_start)
return total_records_length // self.record_length
self.filepath_or_buffer.seek(-80, 2)
last_card_bytes = self.filepath_or_buffer.read(80)
last_card = np.frombuffer(last_card_bytes, dtype=np.uint64)
# 8-byte blank: 2314885530818453536 == 0x2020202020202020, eight ASCII spaces
ix = np.flatnonzero(last_card == 2314885530818453536)
if len(ix) == 0:
tail_pad = 0
else:
tail_pad = 8 * len(ix)
self.filepath_or_buffer.seek(self.record_start)
return (total_records_length - tail_pad) // self.record_length
def get_chunk(self, size: int | None = None) -> pd.DataFrame:
"""
Reads lines from Xport file and returns as dataframe
Parameters
----------
size : int, defaults to None
Number of lines to read. If None, reads whole file.
Returns
-------
DataFrame
"""
if size is None:
size = self._chunksize
return self.read(nrows=size)
def _missing_double(self, vec):
v = vec.view(dtype="u1,u1,u2,u4")
miss = (v["f1"] == 0) & (v["f2"] == 0) & (v["f3"] == 0)
miss1 = (
((v["f0"] >= 0x41) & (v["f0"] <= 0x5A))
| (v["f0"] == 0x5F)
| (v["f0"] == 0x2E)
)
miss &= miss1
return miss
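# A sketch of the encoding checked above: an ordinary XPORT missing value
# is a '.' byte followed by seven zero bytes; special missings put 'A'-'Z'
# or '_' in the first byte instead.
#
#     vec = np.array([b".\x00\x00\x00\x00\x00\x00\x00",
#                     b"\x41\x10\x00\x00\x00\x00\x00\x00"], dtype="S8")
#     self._missing_double(vec)  # -> array([ True, False])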
@Appender(_read_method_doc)
def read(self, nrows: int | None = None) -> pd.DataFrame:
if nrows is None:
nrows = self.nobs
read_lines = min(nrows, self.nobs - self._lines_read)
read_len = read_lines * self.record_length
if read_len <= 0:
self.close()
raise StopIteration
raw = self.filepath_or_buffer.read(read_len)
data = np.frombuffer(raw, dtype=self._dtype, count=read_lines)
df_data = {}
for j, x in enumerate(self.columns):
vec = data["s" + str(j)]
ntype = self.fields[j]["ntype"]
if ntype == "numeric":
vec = _handle_truncated_float_vec(vec, self.fields[j]["field_length"])
miss = self._missing_double(vec)
v = _parse_float_vec(vec)
v[miss] = np.nan
elif self.fields[j]["ntype"] == "char":
v = [y.rstrip() for y in vec]
if self._encoding is not None:
v = [y.decode(self._encoding) for y in v]
df_data.update({x: v})
df = pd.DataFrame(df_data)
if self._index is None:
df.index = pd.Index(range(self._lines_read, self._lines_read + read_lines))
else:
df = df.set_index(self._index)
self._lines_read += read_lines
return df
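# A usage sketch for the reader above ("example.xpt" and process() are
# hypothetical): stream a transport file in chunks; numeric columns come
# back as float64 and text is decoded with the default ISO-8859-1 encoding.
#
#     from pandas.io.sas.sas_xport import XportReader
#
#     with XportReader("example.xpt", chunksize=10_000) as rdr:
#         for chunk in rdr:
#             process(chunk)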


@@ -0,0 +1,178 @@
"""
Read SAS sas7bdat or xport files.
"""
from __future__ import annotations
from abc import (
ABC,
abstractmethod,
)
from typing import (
TYPE_CHECKING,
overload,
)
from pandas.util._decorators import doc
from pandas.core.shared_docs import _shared_docs
from pandas.io.common import stringify_path
if TYPE_CHECKING:
from collections.abc import Hashable
from types import TracebackType
from pandas._typing import (
CompressionOptions,
FilePath,
ReadBuffer,
Self,
)
from pandas import DataFrame
class ReaderBase(ABC):
"""
Protocol for XportReader and SAS7BDATReader classes.
"""
@abstractmethod
def read(self, nrows: int | None = None) -> DataFrame:
...
@abstractmethod
def close(self) -> None:
...
def __enter__(self) -> Self:
return self
def __exit__(
self,
exc_type: type[BaseException] | None,
exc_value: BaseException | None,
traceback: TracebackType | None,
) -> None:
self.close()
@overload
def read_sas(
filepath_or_buffer: FilePath | ReadBuffer[bytes],
*,
format: str | None = ...,
index: Hashable | None = ...,
encoding: str | None = ...,
chunksize: int = ...,
iterator: bool = ...,
compression: CompressionOptions = ...,
) -> ReaderBase:
...
@overload
def read_sas(
filepath_or_buffer: FilePath | ReadBuffer[bytes],
*,
format: str | None = ...,
index: Hashable | None = ...,
encoding: str | None = ...,
chunksize: None = ...,
iterator: bool = ...,
compression: CompressionOptions = ...,
) -> DataFrame | ReaderBase:
...
@doc(decompression_options=_shared_docs["decompression_options"] % "filepath_or_buffer")
def read_sas(
filepath_or_buffer: FilePath | ReadBuffer[bytes],
*,
format: str | None = None,
index: Hashable | None = None,
encoding: str | None = None,
chunksize: int | None = None,
iterator: bool = False,
compression: CompressionOptions = "infer",
) -> DataFrame | ReaderBase:
"""
Read SAS files stored as either XPORT or SAS7BDAT format files.
Parameters
----------
filepath_or_buffer : str, path object, or file-like object
String, path object (implementing ``os.PathLike[str]``), or file-like
object implementing a binary ``read()`` function. The string could be a URL.
Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
expected. A local file could be:
``file://localhost/path/to/table.sas7bdat``.
format : str {{'xport', 'sas7bdat'}} or None
If None, file format is inferred from file extension. If 'xport' or
'sas7bdat', uses the corresponding format.
index : identifier of index column, defaults to None
Identifier of column that should be used as index of the DataFrame.
encoding : str, default is None
Encoding for text data. If None, text data are stored as raw bytes.
chunksize : int
Read file `chunksize` lines at a time, returns iterator.
iterator : bool, defaults to False
If True, returns an iterator for reading the file incrementally.
{decompression_options}
Returns
-------
DataFrame if iterator=False and chunksize=None, else SAS7BDATReader
or XportReader
Examples
--------
>>> df = pd.read_sas("sas_data.sas7bdat") # doctest: +SKIP
"""
if format is None:
buffer_error_msg = (
"If this is a buffer object rather "
"than a string name, you must specify a format string"
)
filepath_or_buffer = stringify_path(filepath_or_buffer)
if not isinstance(filepath_or_buffer, str):
raise ValueError(buffer_error_msg)
fname = filepath_or_buffer.lower()
if ".xpt" in fname:
format = "xport"
elif ".sas7bdat" in fname:
format = "sas7bdat"
else:
raise ValueError(
f"unable to infer format of SAS file from filename: {repr(fname)}"
)
reader: ReaderBase
if format.lower() == "xport":
from pandas.io.sas.sas_xport import XportReader
reader = XportReader(
filepath_or_buffer,
index=index,
encoding=encoding,
chunksize=chunksize,
compression=compression,
)
elif format.lower() == "sas7bdat":
from pandas.io.sas.sas7bdat import SAS7BDATReader
reader = SAS7BDATReader(
filepath_or_buffer,
index=index,
encoding=encoding,
chunksize=chunksize,
compression=compression,
)
else:
raise ValueError("unknown SAS format")
if iterator or chunksize:
return reader
with reader:
return reader.read()
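# A usage sketch for the public entry point ("example.sas7bdat" and
# process() are hypothetical): the format is inferred from the extension,
# and passing chunksize returns the reader itself for incremental use.
#
#     import pandas as pd
#
#     df = pd.read_sas("example.sas7bdat")
#     with pd.read_sas("example.sas7bdat", chunksize=100_000) as rdr:
#         for chunk in rdr:
#             process(chunk)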