Smarter default client lookup
Currently, the default client is constructed purely based on the implementation class and not on the path.
It would be nice if there were a way to register credential arguments based on the path, so that clients can be constructed as needed for a given location.
For instance, different S3 buckets may require different access keys or profiles, and different Azure accounts or containers may have different access keys.
The registry could simply be a table mapping regular expressions to a set of arguments to pass to the corresponding client constructor:
# Proposed API (illustrative): each call maps a path pattern to the keyword
# arguments that would be passed to the corresponding client constructor.
CloudClient.register_default('s3://my-bucket/', dict(profile_name='my-profile'))
CloudClient.register_default('s3://bobs-bucket/', dict(profile_name='bobs-profile'))
CloudClient.register_default('azure://acme/widgets/', dict(credential='?...'))  # where acme is the account name
This would save the user from the chore of manually creating and passing around clients when working with paths that cannot be accessed using a single client of a given type.
FYI, here is a similar feature I added to the AzurePath hack I posted on #157:
@register_path_class('azure2')
class AzurePath(AzureBlobPath):
"""CloudPath for accessing Azure blob/dfs storage using azure:// URL scheme."""
client: 'AzureClient'
cloud_prefix: str = "azure://"
__token_registry: Dict[Pattern,str] = {}
@classmethod
def register_token(cls, pattern: Union[str, Pattern], token: str) -> None:
    """Register a token for URLs matching the given regular expression pattern.

    Arguments:
        pattern: a regular expression (string or precompiled pattern) that will
            be matched against the normalized path URL, which will be of the
            form "azure://<account>/<container>..."
        token: the SAS token to use for accessing paths that match the expression.
    """
    # re.compile() accepts strings and already-compiled patterns alike, so the
    # registry is always keyed by compiled pattern objects.
    cls.__token_registry[re.compile(pattern)] = token
@classmethod
def unregister_tokens(cls, name: Union[str, "CloudPath"]) -> int:
    """Remove all registry entries whose pattern matches ``name``.

    Arguments:
        name: a path or URL; it is converted with str() and every registered
            pattern is tested against the result with ``Pattern.match``.

    Returns:
        number of entries removed
    """
    target = str(name)
    # Collect the matching keys first so the dict is not mutated while it is
    # being iterated.
    matched = [pattern for pattern in cls.__token_registry if pattern.match(target)]
    for pattern in matched:
        cls.__token_registry.pop(pattern)
    return len(matched)
@classmethod
def clear_tokens(cls) -> int:
    """Clear the token registry.

    Returns:
        Number of tokens that were removed.
    """
    # Capture the size before clearing; afterwards it would always be zero.
    removed = len(cls.__token_registry)
    cls.__token_registry.clear()
    return removed
def __init__(self, cloud_path: Union[str, CloudPath],
client: Optional[AzureBlobClient] = None,
token: Optional[str] = None
):
"""Constructs new AzurePath instance

NOTE(review): the indentation of this method has been lost, so the nesting of the
statements below is inferred from their order -- confirm against the original.

Arguments:
cloud_path: the resource path. May either be an existing AzurePath,
a string of the form "azure://<account>/<container>..." or
a URL of the form "https://<account>.blob.core.windows.net/<container>...".
String inputs are rewritten to the normalized form
"azure://<account>[.<type>]/<path>" before being passed to the base class.
client: the client to use with this path. The client will be ignored if a
token is provided, the input path has a query string, or the client's
service_client.account_name differs from the account parsed from the path.
token: the SAS token to use to access this path. This will override any
token in the path's query string. When neither a token nor a query string
is present, the tokens registered with register_token are consulted.
Raises:
ValueError: if the network location of a string path does not match the
expected account pattern.
"""
if isinstance(cloud_path, str):
parsed = urlparse(cloud_path)
# The network location is "<account>", optionally followed by ".blob" or ".dfs"
# and then optionally by ".core.windows.net"; matching ignores case.
m = re.match(r'(?P<account>[a-z0-9]+)(\.(?P<type>blob|dfs)(\.core\.windows\.net)?)?',
parsed.netloc,
flags=re.IGNORECASE)
if m is None:
raise ValueError(f"Bad azure path '{cloud_path}'")
account = m.group('account')
# The storage type defaults to 'blob' when the network location does not name one.
fstype = m.group('type') or 'blob'
account_url = f'https://{account}.{fstype}.core.windows.net/'
# 'blob' stays implicit in the normalized URL; any other type is kept as a
# ".<type>" suffix on the account.
optional_type = '' if fstype == 'blob' else '.' + fstype
cloud_path = f"azure://{account}{optional_type}/{parsed.path.lstrip('/')}"
# NOTE(review): parsed, account and account_url are only assigned for string
# inputs, so presumably this condition is nested under the isinstance check
# above -- confirm; otherwise a non-string cloud_path would hit unbound names.
# A new client is built when none was given, when a token is available (argument
# or query string), or when the given client belongs to a different account.
if client is None or parsed.query or token or client.service_client.account_name != account:
if token is not None:
# Normalize the token to exactly one leading '?'.
token = '?' + token.lstrip('?')
elif parsed.query:
token = '?' + parsed.query
else:
# NOTE(review): no break is visible here, so if several registered patterns
# match, the last one iterated supplies the token -- confirm this is intended.
for pattern, _token in self.__token_registry.items():
if pattern.match(cloud_path):
token = _token
client = AzureClient(account_url, token)
super().__init__(cloud_path, client = client)
@classmethod
def is_valid_cloudpath(cls, path: Union[str, "CloudPath"], raise_on_error=False) -> bool:
    """Return True if ``path`` looks like an Azure blob/dfs path.

    Specifically, the lower-cased string form of ``path`` must either start
    with azure:// or be an https:// URL of the form
    https://<account>.(blob|dfs).core.windows.net/...

    Arguments:
        path: the path or URL to check; it is converted with str().
        raise_on_error: when true, raise instead of returning False.

    Returns:
        True if the path matches one of the accepted forms, otherwise False.

    Raises:
        InvalidPrefixError: if ``raise_on_error`` is true and the path is not valid.
    """
    # re.match anchors at the start only, so this is a prefix check.
    valid = bool(re.match(r'(azure://|https://[a-z0-9]+\.(blob|dfs)\.core\.windows\.net)', str(path).lower()))
    if raise_on_error and not valid:
        raise InvalidPrefixError(
            f"'{path}' is not a valid path since it does not start with '{cls.cloud_prefix}' "
            "or valid Azure https blob or dfs location."
        )
    return valid
@property
def container(self) -> str:
    """Return the second '/'-separated segment of ``self._no_prefix``.

    Presumably ``_no_prefix`` has the form "<account>/<container>/...", which
    makes that segment the container name -- verify against the base class.

    Raises:
        IndexError: if ``_no_prefix`` contains no '/' separator.
    """
    # maxsplit=2 keeps everything after the container in one trailing piece.
    return self._no_prefix.split('/', 2)[1]
@property
def blob(self) -> str:
return super().blob.split('/',1)[1```