New DALFormatError for a DP0.3 query

During the Delegate Assembly on Fri Sep 29, all attendees who attempted to execute this query (which had worked on Thu Sep 28, pre-patch window) ran into the same error. This happened with small, medium, and large containers.

RSP staff are already working on this issue, which was reported internally during the assembly. But I wanted to post it here so that we can also post a solution later.

To recreate the issue, use the ssotap service to execute the ADQL query below.

from lsst.rsp import get_tap_service
service = get_tap_service("ssotap")
allobj = service.search("SELECT mpc.ssObjectId, mpc.e, mpc.incl, mpc.q, "
                      "mpc.node, mpc.peri, sso.* "
                      "FROM dp03_catalogs_10yr.MPCORB as mpc "
                      "JOIN dp03_catalogs_10yr.SSObject as sso "
                      "ON mpc.ssObjectId = sso.ssObjectId "
                      f"WHERE sso.flags < 2048 ").to_table()
The resulting error message:
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/http/client.py:566, in HTTPResponse._get_chunk_left(self)
    565 try:
--> 566     chunk_left = self._read_next_chunk_size()
    567 except ValueError:
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/http/client.py:533, in HTTPResponse._read_next_chunk_size(self)
    532 try:
--> 533     return int(line, 16)
    534 except ValueError:
    535     # close the connection as protocol synchronisation is
    536     # probably lost
ValueError: invalid literal for int() with base 16: b''
During handling of the above exception, another exception occurred:
IncompleteRead                            Traceback (most recent call last)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/http/client.py:583, in HTTPResponse._read_chunked(self, amt)
    582 while True:
--> 583     chunk_left = self._get_chunk_left()
    584     if chunk_left is None:
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/http/client.py:568, in HTTPResponse._get_chunk_left(self)
    567 except ValueError:
--> 568     raise IncompleteRead(b'')
    569 if chunk_left == 0:
    570     # last chunk: 1*("0") [ chunk-extension ] CRLF
IncompleteRead: IncompleteRead(0 bytes read)
The above exception was the direct cause of the following exception:
IncompleteRead                            Traceback (most recent call last)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/urllib3/response.py:444, in HTTPResponse._error_catcher(self)
    443 try:
--> 444     yield
    446 except SocketTimeout:
    447     # FIXME: Ideally we'd like to include the url in the ReadTimeoutError but
    448     # there is yet no clean way to get at it from this context.
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/urllib3/response.py:567, in HTTPResponse.read(self, amt, decode_content, cache_content)
    566 with self._error_catcher():
--> 567     data = self._fp_read(amt) if not fp_closed else b""
    568     if amt is None:
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/urllib3/response.py:533, in HTTPResponse._fp_read(self, amt)
    531 else:
    532     # StringIO doesn't like amt=None
--> 533     return self._fp.read(amt) if amt is not None else self._fp.read()
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/http/client.py:460, in HTTPResponse.read(self, amt)
    459 if self.chunked:
--> 460     return self._read_chunked(amt)
    462 if amt is not None:
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/http/client.py:598, in HTTPResponse._read_chunked(self, amt)
    597 except IncompleteRead as exc:
--> 598     raise IncompleteRead(b''.join(value)) from exc
IncompleteRead: IncompleteRead(3422 bytes read)
During handling of the above exception, another exception occurred:
ProtocolError                             Traceback (most recent call last)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/pyvo/dal/query.py:241, in DALQuery.execute_votable(self, post)
    240 try:
--> 241     return votableparse(self.execute_stream(post=post).read)
    242 except Exception as e:
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/utils/decorators.py:604, in deprecated_renamed_argument.<locals>.decorator.<locals>.wrapper(*args, **kwargs)
    602         warnings.warn(msg, warning_type, stacklevel=2)
--> 604 return function(*args, **kwargs)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/table.py:177, in parse(source, columns, invalid, verify, chunk_size, table_number, table_id, filename, unit_format, datatype_mapping, _debug_python_based_parser)
    174 with iterparser.get_xml_iterator(
    175     source, _debug_python_based_parser=_debug_python_based_parser
    176 ) as iterator:
--> 177     return tree.VOTableFile(config=config, pos=(1, 1)).parse(iterator, config)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/tree.py:3900, in VOTableFile.parse(self, iterator, config)
   3899 if start:
-> 3900     tag_mapping.get(tag, self._add_unknown_tag)(
   3901         iterator, tag, data, config, pos
   3902     )
   3903 elif tag == "DESCRIPTION":
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/tree.py:3780, in VOTableFile._add_resource(self, iterator, tag, data, config, pos)
   3779 self.resources.append(resource)
-> 3780 resource.parse(self, iterator, config)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/tree.py:3578, in Resource.parse(self, votable, iterator, config)
   3577 if start:
-> 3578     tag_mapping.get(tag, self._add_unknown_tag)(
   3579         iterator, tag, data, config, pos
   3580     )
   3581 elif tag == "DESCRIPTION":
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/tree.py:3524, in Resource._add_table(self, iterator, tag, data, config, pos)
   3523 self.tables.append(table)
-> 3524 table.parse(iterator, config)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/tree.py:2744, in Table.parse(self, iterator, config)
   2743 warn_unknown_attrs("TABLEDATA", data.keys(), config, pos)
-> 2744 self.array = self._parse_tabledata(
   2745     iterator, colnumbers, fields, config
   2746 )
   2747 break
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/astropy/io/votable/tree.py:2820, in Table._parse_tabledata(self, iterator, colnumbers, fields, config)
   2819 i = 0
-> 2820 for start, tag, data, pos in iterator:
   2821     if start:
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/urllib3/response.py:566, in HTTPResponse.read(self, amt, decode_content, cache_content)
    564 fp_closed = getattr(self._fp, "closed", False)
--> 566 with self._error_catcher():
    567     data = self._fp_read(amt) if not fp_closed else b""
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/contextlib.py:155, in _GeneratorContextManager.__exit__(self, typ, value, traceback)
    154 try:
--> 155     self.gen.throw(typ, value, traceback)
    156 except StopIteration as exc:
    157     # Suppress StopIteration *unless* it's the same exception that
    158     # was passed to throw().  This prevents a StopIteration
    159     # raised inside the "with" statement from being suppressed.
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/urllib3/response.py:461, in HTTPResponse._error_catcher(self)
    459 except (HTTPException, SocketError) as e:
    460     # This includes IncompleteRead.
--> 461     raise ProtocolError("Connection broken: %r" % e, e)
    463 # If no exception is thrown, we should avoid cleaning up
    464 # unnecessarily.
ProtocolError: ('Connection broken: IncompleteRead(3422 bytes read)', IncompleteRead(3422 bytes read))
During handling of the above exception, another exception occurred:
DALFormatError                            Traceback (most recent call last)
Cell In[3], line 2
      1 service = get_tap_service("ssotap")
----> 2 allobj = service.search("SELECT mpc.ssObjectId, mpc.e, mpc.incl, mpc.q, "
      3                       "mpc.node, mpc.peri, sso.* "
      4                       "FROM dp03_catalogs_10yr.MPCORB as mpc "
      5                       "JOIN dp03_catalogs_10yr.SSObject as sso "
      6                       "ON mpc.ssObjectId = sso.ssObjectId "
      7                       "WHERE sso.flags < 2048 ").to_table()
      8 print(len(allobj))
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/pyvo/dal/tap.py:257, in TAPService.run_sync(self, query, language, maxrec, uploads, **keywords)
    228 def run_sync(
    229         self, query, language="ADQL", maxrec=None, uploads=None,
    230         **keywords):
    231     """
    232     runs sync query and returns its result
    233 
   (...)
    253     TAPResults
    254     """
    255     return self.create_query(
    256         query, language=language, maxrec=maxrec, uploads=uploads,
--> 257         **keywords).execute()
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/pyvo/dal/tap.py:1076, in TAPQuery.execute(self)
   1062 def execute(self):
   1063     """
   1064     submit the query and return the results as a TAPResults instance
   1065 
   (...)
   1074        for errors parsing the VOTable response
   1075     """
-> 1076     return TAPResults(self.execute_votable(), url=self.queryurl, session=self._session)
File /opt/lsst/software/stack/conda/miniconda3-py38_4.9.2/envs/lsst-scipipe-7.0.1/lib/python3.11/site-packages/pyvo/dal/query.py:244, in DALQuery.execute_votable(self, post)
    242 except Exception as e:
    243     self.raise_if_error()
--> 244     raise DALFormatError(e, self.queryurl)
DALFormatError: ProtocolError: ('Connection broken: IncompleteRead(3422 bytes read)', IncompleteRead(3422 bytes read))

The direct cause of the failure was that the TAP server serving the DP0.3 data set ran out of memory and crashed.

This was partially due to a Kubernetes misconfiguration that made the service think it had more available memory than it actually did, so it did not attempt to free resources as aggressively as it could. That misconfiguration has now been fixed. However, this implies that the above query (possibly only when used by multiple people at the same time?) is very resource-intensive on the TAP server, and it’s possible that the service would not have remained stable even without the misconfiguration.

1 Like