Tried accesing a file stored in a Azure Data Lake Storage using pandas read_csv method. But facing issue as the folder path is interpreted as a list. Is there any way to explicity specify pandas that its a path and not list?
The issue is with the "Type – [‘D’]" mentioned in the path
import pandas as pd
datalake_connection_string = "<connection_string_for_the_container>"
data=pd.read_csv(f"abfs://container_name@storage_account_name.blob.core.windows.net/OutputFiles/CodeOutputs/Type - ['D']/SOLUTIONS/summary.csv",storage_options={"connection_string": datalake_connection_string})
The Error I’m facing is as below:
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
Input In [14], in <cell line: 1>()
----> 1 data=pd.read_csv(f"abfs://container_name@storage_account_name.blob.core.windows.net/OutputFiles/CodeOutputs/Type - ['D']/SOLUTIONS/summary.csv",storage_options={"connection_string": ADLSConnectionString})
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/util/_decorators.py:211, in deprecate_kwarg.<locals>._deprecate_kwarg.<locals>.wrapper(*args, **kwargs)
209 else:
210 kwargs[new_arg_name] = new_arg_value
--> 211 return func(*args, **kwargs)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/util/_decorators.py:331, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
325 if len(args) > num_allow_args:
326 warnings.warn(
327 msg.format(arguments=_format_argument_list(allow_args)),
328 FutureWarning,
329 stacklevel=find_stack_level(),
330 )
--> 331 return func(*args, **kwargs)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:950, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
935 kwds_defaults = _refine_defaults_read(
936 dialect,
937 delimiter,
(...)
946 defaults={"delimiter": ","},
947 )
948 kwds.update(kwds_defaults)
--> 950 return _read(filepath_or_buffer, kwds)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:605, in _read(filepath_or_buffer, kwds)
602 _validate_names(kwds.get("names", None))
604 # Create the parser.
--> 605 parser = TextFileReader(filepath_or_buffer, **kwds)
607 if chunksize or iterator:
608 return parser
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1442, in TextFileReader.__init__(self, f, engine, **kwds)
1439 self.options["has_index_names"] = kwds["has_index_names"]
1441 self.handles: IOHandles | None = None
-> 1442 self._engine = self._make_engine(f, self.engine)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/parsers/readers.py:1735, in TextFileReader._make_engine(self, f, engine)
1733 if "b" not in mode:
1734 mode += "b"
-> 1735 self.handles = get_handle(
1736 f,
1737 mode,
1738 encoding=self.options.get("encoding", None),
1739 compression=self.options.get("compression", None),
1740 memory_map=self.options.get("memory_map", False),
1741 is_text=is_text,
1742 errors=self.options.get("encoding_errors", "strict"),
1743 storage_options=self.options.get("storage_options", None),
1744 )
1745 assert self.handles is not None
1746 f = self.handles.handle
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/common.py:713, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
710 codecs.lookup_error(errors)
712 # open URLs
--> 713 ioargs = _get_filepath_or_buffer(
714 path_or_buf,
715 encoding=encoding,
716 compression=compression,
717 mode=mode,
718 storage_options=storage_options,
719 )
721 handle = ioargs.filepath_or_buffer
722 handles: list[BaseBuffer]
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/pandas/io/common.py:409, in _get_filepath_or_buffer(filepath_or_buffer, encoding, compression, mode, storage_options)
406 pass
408 try:
--> 409 file_obj = fsspec.open(
410 filepath_or_buffer, mode=fsspec_mode, **(storage_options or {})
411 ).open()
412 # GH 34626 Reads from Public Buckets without Credentials needs anon=True
413 except tuple(err_types_to_retry_with_anon):
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/fsspec/core.py:441, in open(urlpath, mode, compression, encoding, errors, protocol, newline, **kwargs)
391 def open(
392 urlpath,
393 mode="rb",
(...)
399 **kwargs,
400 ):
401 """Given a path or paths, return one ``OpenFile`` object.
402
403 Parameters
(...)
439 ``OpenFile`` object.
440 """
--> 441 return open_files(
442 urlpath=[urlpath],
443 mode=mode,
444 compression=compression,
445 encoding=encoding,
446 errors=errors,
447 protocol=protocol,
448 newline=newline,
449 expand=False,
450 **kwargs,
451 )[0]
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/fsspec/core.py:195, in OpenFiles.__getitem__(self, item)
194 def __getitem__(self, item):
--> 195 out = super().__getitem__(item)
196 if isinstance(item, slice):
197 return OpenFiles(out, mode=self.mode, fs=self.fs)
IndexError: list index out of range
Can any one please suggest a workaround for this?
2
Answers
I’d try to break it down using something like os.path.join. This worked for me (on linux):
I tried in my environment and got the below results:
You can use the below code to read a file from a blob using pandas if it has a list-like directory structure inside it.
In that structure the
Type - ['D']
for each quotation mark there should be a backward slash to make it redirect to the right path.You can get the Blob SAS Url from the portal:
Code:
Output:
Reference:
Naming and Referencing Containers, Blobs, and Metadata – Azure Storage | Microsoft Learn