airbyte_cdk.sources.declarative.parsers.manifest_reference_resolver

  1#
  2# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  3#
  4
  5import re
  6from typing import Any, Mapping, Set, Tuple, Union
  7
  8from airbyte_cdk.sources.declarative.parsers.custom_exceptions import (
  9    CircularReferenceException,
 10    UndefinedReferenceException,
 11)
 12
 13REF_TAG = "$ref"
 14
 15
 16class ManifestReferenceResolver:
 17    """
 18    An incoming manifest can contain references to values previously defined.
 19    This parser will dereference these values to produce a complete ConnectionDefinition.
 20
 21    References can be defined using a #/<arg> string.
 22    ```
 23    key: 1234
 24    reference: "#/key"
 25    ```
 26    will produce the following definition:
 27    ```
 28    key: 1234
 29    reference: 1234
 30    ```
 31    This also works with objects:
 32    ```
 33    key_value_pairs:
 34      k1: v1
 35      k2: v2
 36    same_key_value_pairs: "#/key_value_pairs"
 37    ```
 38    will produce the following definition:
 39    ```
 40    key_value_pairs:
 41      k1: v1
 42      k2: v2
 43    same_key_value_pairs:
 44      k1: v1
 45      k2: v2
 46    ```
 47
 48    The $ref keyword can be used to refer to an object and enhance it with addition key-value pairs
 49    ```
 50    key_value_pairs:
 51      k1: v1
 52      k2: v2
 53    same_key_value_pairs:
 54      $ref: "#/key_value_pairs"
 55      k3: v3
 56    ```
 57    will produce the following definition:
 58    ```
 59    key_value_pairs:
 60      k1: v1
 61      k2: v2
 62    same_key_value_pairs:
 63      k1: v1
 64      k2: v2
 65      k3: v3
 66    ```
 67
 68    References can also point to nested values.
 69    Nested references are ambiguous because one could define a key containing with `.`
 70    in this example, we want to refer to the limit key in the dict object:
 71    ```
 72    dict:
 73        limit: 50
 74    limit_ref: "#/dict/limit"
 75    ```
 76    will produce the following definition:
 77    ```
 78    dict
 79        limit: 50
 80    limit-ref: 50
 81    ```
 82
 83    whereas here we want to access the `nested/path` value.
 84    ```
 85    nested:
 86        path: "first one"
 87    nested/path: "uh oh"
 88    value: "#/nested/path
 89    ```
 90    will produce the following definition:
 91    ```
 92    nested:
 93        path: "first one"
 94    nested/path: "uh oh"
 95    value: "uh oh"
 96    ```
 97
 98    to resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward
 99    until we find a key with the given path, or until there is nothing to traverse.
100    """
101
102    def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]:
103        """
104        :param manifest: incoming manifest that could have references to previously defined components
105        :return:
106        """
107        return self._evaluate_node(manifest, manifest, set())  # type: ignore[no-any-return]
108
109    def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any:
110        if isinstance(node, dict):
111            evaluated_dict = {
112                k: self._evaluate_node(v, manifest, visited)
113                for k, v in node.items()
114                if not self._is_ref_key(k)
115            }
116            if REF_TAG in node:
117                # The node includes a $ref key, so we splat the referenced value(s) into the evaluated dict
118                evaluated_ref = self._evaluate_node(node[REF_TAG], manifest, visited)
119                if not isinstance(evaluated_ref, dict):
120                    return evaluated_ref
121                else:
122                    # The values defined on the component take precedence over the reference values
123                    return evaluated_ref | evaluated_dict
124            else:
125                return evaluated_dict
126        elif isinstance(node, list):
127            return [self._evaluate_node(v, manifest, visited) for v in node]
128        elif self._is_ref(node):
129            if node in visited:
130                raise CircularReferenceException(node)
131            visited.add(node)
132            ret = self._evaluate_node(self._lookup_ref_value(node, manifest), manifest, visited)
133            visited.remove(node)
134            return ret
135        else:
136            return node
137
138    def _lookup_ref_value(self, ref: str, manifest: Mapping[str, Any]) -> Any:
139        ref_match = re.match(r"#/(.*)", ref)
140        if not ref_match:
141            raise ValueError(f"Invalid reference format {ref}")
142        try:
143            path = ref_match.groups()[0]
144            return self._read_ref_value(path, manifest)
145        except (AttributeError, KeyError, IndexError):
146            raise UndefinedReferenceException(path, ref)
147
148    @staticmethod
149    def _is_ref(node: Any) -> bool:
150        return isinstance(node, str) and node.startswith("#/")
151
152    @staticmethod
153    def _is_ref_key(key: str) -> bool:
154        return bool(key == REF_TAG)
155
156    @staticmethod
157    def _read_ref_value(ref: str, manifest_node: Mapping[str, Any]) -> Any:
158        """
159        Read the value at the referenced location of the manifest.
160
161        References are ambiguous because one could define a key containing `/`
162        In this example, we want to refer to the `limit` key in the `dict` object:
163            dict:
164                limit: 50
165            limit_ref: "#/dict/limit"
166
167        Whereas here we want to access the `nested/path` value.
168          nested:
169            path: "first one"
170          nested/path: "uh oh"
171          value: "#/nested/path"
172
173        To resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward
174        until we find a key with the given path, or until there is nothing to traverse.
175
176        Consider the path foo/bar/baz. To resolve the ambiguity, we first try 'foo/bar/baz' in its entirety as a top-level key. If this
177        fails, we try 'foo' as the top-level key, and if this succeeds, pass 'bar/baz' on as the key to be tried at the next level.
178        """
179        while ref:
180            try:
181                return manifest_node[ref]
182            except (KeyError, TypeError):
183                head, ref = _parse_path(ref)
184                manifest_node = manifest_node[head]  # type: ignore # Couldn't figure out how to fix this since manifest_node can get reassigned into other types like lists
185        return manifest_node
186
187
188def _parse_path(ref: str) -> Tuple[Union[str, int], str]:
189    """
190    Return the next path component, together with the rest of the path.
191
192    A path component may be a string key, or an int index.
193
194    >>> _parse_path("foo/bar")
195    "foo", "bar"
196    >>> _parse_path("foo/7/8/bar")
197    "foo", "7/8/bar"
198    >>> _parse_path("7/8/bar")
199    7, "8/bar"
200    >>> _parse_path("8/bar")
201    8, "bar"
202    >>> _parse_path("8foo/bar")
203    "8foo", "bar"
204    """
205    match = re.match(r"([^/]*)/?(.*)", ref)
206    if match:
207        first, rest = match.groups()
208        try:
209            return int(first), rest
210        except ValueError:
211            return first, rest
212    else:
213        raise ValueError(f"Invalid path {ref} specified")
REF_TAG = '$ref'
class ManifestReferenceResolver:
 17class ManifestReferenceResolver:
 18    """
 19    An incoming manifest can contain references to values previously defined.
 20    This parser will dereference these values to produce a complete ConnectionDefinition.
 21
 22    References can be defined using a #/<arg> string.
 23    ```
 24    key: 1234
 25    reference: "#/key"
 26    ```
 27    will produce the following definition:
 28    ```
 29    key: 1234
 30    reference: 1234
 31    ```
 32    This also works with objects:
 33    ```
 34    key_value_pairs:
 35      k1: v1
 36      k2: v2
 37    same_key_value_pairs: "#/key_value_pairs"
 38    ```
 39    will produce the following definition:
 40    ```
 41    key_value_pairs:
 42      k1: v1
 43      k2: v2
 44    same_key_value_pairs:
 45      k1: v1
 46      k2: v2
 47    ```
 48
 49    The $ref keyword can be used to refer to an object and enhance it with addition key-value pairs
 50    ```
 51    key_value_pairs:
 52      k1: v1
 53      k2: v2
 54    same_key_value_pairs:
 55      $ref: "#/key_value_pairs"
 56      k3: v3
 57    ```
 58    will produce the following definition:
 59    ```
 60    key_value_pairs:
 61      k1: v1
 62      k2: v2
 63    same_key_value_pairs:
 64      k1: v1
 65      k2: v2
 66      k3: v3
 67    ```
 68
 69    References can also point to nested values.
 70    Nested references are ambiguous because one could define a key containing with `.`
 71    in this example, we want to refer to the limit key in the dict object:
 72    ```
 73    dict:
 74        limit: 50
 75    limit_ref: "#/dict/limit"
 76    ```
 77    will produce the following definition:
 78    ```
 79    dict
 80        limit: 50
 81    limit-ref: 50
 82    ```
 83
 84    whereas here we want to access the `nested/path` value.
 85    ```
 86    nested:
 87        path: "first one"
 88    nested/path: "uh oh"
 89    value: "#/nested/path
 90    ```
 91    will produce the following definition:
 92    ```
 93    nested:
 94        path: "first one"
 95    nested/path: "uh oh"
 96    value: "uh oh"
 97    ```
 98
 99    to resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward
100    until we find a key with the given path, or until there is nothing to traverse.
101    """
102
103    def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]:
104        """
105        :param manifest: incoming manifest that could have references to previously defined components
106        :return:
107        """
108        return self._evaluate_node(manifest, manifest, set())  # type: ignore[no-any-return]
109
110    def _evaluate_node(self, node: Any, manifest: Mapping[str, Any], visited: Set[Any]) -> Any:
111        if isinstance(node, dict):
112            evaluated_dict = {
113                k: self._evaluate_node(v, manifest, visited)
114                for k, v in node.items()
115                if not self._is_ref_key(k)
116            }
117            if REF_TAG in node:
118                # The node includes a $ref key, so we splat the referenced value(s) into the evaluated dict
119                evaluated_ref = self._evaluate_node(node[REF_TAG], manifest, visited)
120                if not isinstance(evaluated_ref, dict):
121                    return evaluated_ref
122                else:
123                    # The values defined on the component take precedence over the reference values
124                    return evaluated_ref | evaluated_dict
125            else:
126                return evaluated_dict
127        elif isinstance(node, list):
128            return [self._evaluate_node(v, manifest, visited) for v in node]
129        elif self._is_ref(node):
130            if node in visited:
131                raise CircularReferenceException(node)
132            visited.add(node)
133            ret = self._evaluate_node(self._lookup_ref_value(node, manifest), manifest, visited)
134            visited.remove(node)
135            return ret
136        else:
137            return node
138
139    def _lookup_ref_value(self, ref: str, manifest: Mapping[str, Any]) -> Any:
140        ref_match = re.match(r"#/(.*)", ref)
141        if not ref_match:
142            raise ValueError(f"Invalid reference format {ref}")
143        try:
144            path = ref_match.groups()[0]
145            return self._read_ref_value(path, manifest)
146        except (AttributeError, KeyError, IndexError):
147            raise UndefinedReferenceException(path, ref)
148
149    @staticmethod
150    def _is_ref(node: Any) -> bool:
151        return isinstance(node, str) and node.startswith("#/")
152
153    @staticmethod
154    def _is_ref_key(key: str) -> bool:
155        return bool(key == REF_TAG)
156
157    @staticmethod
158    def _read_ref_value(ref: str, manifest_node: Mapping[str, Any]) -> Any:
159        """
160        Read the value at the referenced location of the manifest.
161
162        References are ambiguous because one could define a key containing `/`
163        In this example, we want to refer to the `limit` key in the `dict` object:
164            dict:
165                limit: 50
166            limit_ref: "#/dict/limit"
167
168        Whereas here we want to access the `nested/path` value.
169          nested:
170            path: "first one"
171          nested/path: "uh oh"
172          value: "#/nested/path"
173
174        To resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward
175        until we find a key with the given path, or until there is nothing to traverse.
176
177        Consider the path foo/bar/baz. To resolve the ambiguity, we first try 'foo/bar/baz' in its entirety as a top-level key. If this
178        fails, we try 'foo' as the top-level key, and if this succeeds, pass 'bar/baz' on as the key to be tried at the next level.
179        """
180        while ref:
181            try:
182                return manifest_node[ref]
183            except (KeyError, TypeError):
184                head, ref = _parse_path(ref)
185                manifest_node = manifest_node[head]  # type: ignore # Couldn't figure out how to fix this since manifest_node can get reassigned into other types like lists
186        return manifest_node

An incoming manifest can contain references to values previously defined. This parser will dereference these values to produce a complete ConnectionDefinition.

References can be defined using a `#/<arg>` string.

key: 1234
reference: "#/key"

will produce the following definition:

key: 1234
reference: 1234

This also works with objects:

key_value_pairs:
  k1: v1
  k2: v2
same_key_value_pairs: "#/key_value_pairs"

will produce the following definition:

key_value_pairs:
  k1: v1
  k2: v2
same_key_value_pairs:
  k1: v1
  k2: v2

The $ref keyword can be used to refer to an object and enhance it with additional key-value pairs:

key_value_pairs:
  k1: v1
  k2: v2
same_key_value_pairs:
  $ref: "#/key_value_pairs"
  k3: v3

will produce the following definition:

key_value_pairs:
  k1: v1
  k2: v2
same_key_value_pairs:
  k1: v1
  k2: v2
  k3: v3

References can also point to nested values. Nested references are ambiguous because one could define a key containing `/`. In this example, we want to refer to the limit key in the dict object:

dict:
    limit: 50
limit_ref: "#/dict/limit"

will produce the following definition:

dict:
    limit: 50
limit_ref: 50

whereas here we want to access the nested/path value.

nested:
    path: "first one"
nested/path: "uh oh"
value: "#/nested/path"

will produce the following definition:

nested:
    path: "first one"
nested/path: "uh oh"
value: "uh oh"

to resolve the ambiguity, we try looking for the reference key at the top level, and then traverse the structs downward until we find a key with the given path, or until there is nothing to traverse.

def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]:
103    def preprocess_manifest(self, manifest: Mapping[str, Any]) -> Mapping[str, Any]:
104        """
105        :param manifest: incoming manifest that could have references to previously defined components
106        :return:
107        """
108        return self._evaluate_node(manifest, manifest, set())  # type: ignore[no-any-return]
Parameters
  • manifest: incoming manifest that could have references to previously defined components
Returns
  • the manifest with each reference replaced by the value it points to