datajoint-python
datajoint-python copied to clipboard
Unexpected OperationalError when deleting entries with 0.13.1
Bug Report
Description
When deleting entries with a relatively complicated restriction, an unexpected OperationalError occurs using DJ 0.13.1. This was not seen in 0.12.9.
Reproducibility
Include:
- OS: Linux
- Python Version: 3.8
- MySQL Version: 5.7.21
- MySQL Deployment Strategy: remote
- DataJoint Version: 0.13.1
- Minimum number of steps to reliably reproduce the issue: I did not get time to clean this up into a minimal reproduction. I could do it later, or provide any info needed from inside my working environment. Here is a code snippet inside the working environment that caused the error:
latest = subject.Subject.aggr(
behavior_plotting.LatestDate,
checking_ts='MAX(checking_ts)') * behavior_plotting.LatestDate & \
['latest_date between curdate() - interval 30 day and curdate()',
(subject.Subject - subject.Death)] & \
(subject.Subject & 'subject_nickname not like "%human%"').proj()
subj_keys = (subject.Subject & behavior_plotting.CumulativeSummary & latest).fetch('KEY')
for subj_key in subj_keys:
(behavior_plotting.CumulativeSummary & subj_key & latest).delete()
- Complete error stack as a result of evaluating the above steps:
---------------------------------------------------------------------------
OperationalError Traceback (most recent call last)
/src/IBL-pipeline/scripts/ingest_increment.py in <module>
1 from ibl_pipeline.process import autoprocess, get_timezone, process_histology, process_qc, populate_wheel
2
----> 3 autoprocess.process_new(timezone=get_timezone())
4 process_histology.main()
5 process_qc.main()
/src/IBL-pipeline/ibl_pipeline/process/autoprocess.py in process_new(previous_dump, latest_dump, job_date, timezone)
101 print('Ingesting behavior...')
102 start = datetime.datetime.now()
--> 103 populate_behavior.main(backtrack_days=30)
104 ingest_status(job_key, 'Populate behavior', start,
105 end=datetime.datetime.now())
/src/IBL-pipeline/ibl_pipeline/process/populate_behavior.py in main(backtrack_days, excluded_tables)
141 dj.config['safemode'] = False
142 for subj_key in tqdm(subj_keys, position=0):
--> 143 (behavior_plotting.CumulativeSummary & subj_key & latest).delete()
144 behavior_plotting.CumulativeSummary.populate(
145 latest & subj_key, suppress_errors=True)
/usr/local/lib/python3.8/dist-packages/datajoint/table.py in delete(self, transaction, safemode)
417 # Cascading delete
418 try:
--> 419 delete_count = self._delete_cascade()
420 except:
421 if transaction:
/usr/local/lib/python3.8/dist-packages/datajoint/table.py in _delete_cascade(self)
355 for _ in range(max_attempts):
356 try:
--> 357 delete_count += self.delete_quick(get_count=True)
358 except IntegrityError as error:
359 match = foreign_key_error_regexp.match(error.args[0]).groupdict()
/usr/local/lib/python3.8/dist-packages/datajoint/table.py in delete_quick(self, get_count)
344 """
345 query = 'DELETE FROM ' + self.full_table_name + self.where_clause()
--> 346 self.connection.query(query)
347 count = self.connection.query("SELECT ROW_COUNT()").fetchone()[0] if get_count else None
348 self._log(query[:255])
/usr/local/lib/python3.8/dist-packages/datajoint/connection.py in query(self, query, args, as_dict, suppress_warnings, reconnect)
300 cursor = self._conn.cursor(cursor=cursor_class)
301 try:
--> 302 self._execute_query(cursor, query, args, suppress_warnings)
303 except errors.LostConnectionError:
304 if not reconnect:
/usr/local/lib/python3.8/dist-packages/datajoint/connection.py in _execute_query(cursor, query, args, suppress_warnings)
266 cursor.execute(query, args)
267 except client.err.Error as err:
--> 268 raise translate_query_error(err, query)
269
270 def query(self, query, args=(), *, as_dict=False, suppress_warnings=True, reconnect=None):
/usr/local/lib/python3.8/dist-packages/datajoint/connection.py in _execute_query(cursor, query, args, suppress_warnings)
264 # suppress all warnings arising from underlying SQL library
265 warnings.simplefilter("ignore")
--> 266 cursor.execute(query, args)
267 except client.err.Error as err:
268 raise translate_query_error(err, query)
/usr/local/lib/python3.8/dist-packages/pymysql/cursors.py in execute(self, query, args)
146 query = self.mogrify(query, args)
147
--> 148 result = self._query(query)
149 self._executed = query
150 return result
/usr/local/lib/python3.8/dist-packages/pymysql/cursors.py in _query(self, q)
308 self._last_executed = q
309 self._clear_result()
--> 310 conn.query(q)
311 self._do_get_result()
312 return self.rowcount
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in query(self, sql, unbuffered)
546 sql = sql.encode(self.encoding, "surrogateescape")
547 self._execute_command(COMMAND.COM_QUERY, sql)
--> 548 self._affected_rows = self._read_query_result(unbuffered=unbuffered)
549 return self._affected_rows
550
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in _read_query_result(self, unbuffered)
773 else:
774 result = MySQLResult(self)
--> 775 result.read()
776 self._result = result
777 if result.server_status is not None:
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in read(self)
1154 def read(self):
1155 try:
-> 1156 first_packet = self.connection._read_packet()
1157
1158 if first_packet.is_ok_packet():
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in _read_packet(self, packet_type)
723 if self._result is not None and self._result.unbuffered_active is True:
724 self._result.unbuffered_active = False
--> 725 packet.raise_for_error()
726 return packet
727
/usr/local/lib/python3.8/dist-packages/pymysql/protocol.py in raise_for_error(self)
219 if DEBUG:
220 print("errno =", errno)
--> 221 err.raise_mysql_exception(self._data)
222
223 def dump(self):
/usr/local/lib/python3.8/dist-packages/pymysql/err.py in raise_mysql_exception(data)
141 if errorclass is None:
142 errorclass = InternalError if errno < 1000 else OperationalError
--> 143 raise errorclass(errno, errval)
OperationalError: (1104, 'The SELECT would examine more than MAX_JOIN_SIZE rows; check your WHERE and use SET SQL_BIG_SELECTS=1 or SET MAX_JOIN_SIZE=# if the SELECT is okay')
Expected Behavior
The delete should complete without raising an OperationalError, as it did in 0.12.9. Instead, the following error is raised:
OperationalError Traceback (most recent call last)
/src/IBL-pipeline/scripts/ingest_increment.py in
/src/IBL-pipeline/ibl_pipeline/process/autoprocess.py in process_new(previous_dump, latest_dump, job_date, timezone) 101 print('Ingesting behavior...') 102 start = datetime.datetime.now() --> 103 populate_behavior.main(backtrack_days=30) 104 ingest_status(job_key, 'Populate behavior', start, 105 end=datetime.datetime.now())
/src/IBL-pipeline/ibl_pipeline/process/populate_behavior.py in main(backtrack_days, excluded_tables) 141 dj.config['safemode'] = False 142 for subj_key in tqdm(subj_keys, position=0): --> 143 (behavior_plotting.CumulativeSummary & subj_key & latest).delete() 144 behavior_plotting.CumulativeSummary.populate( 145 latest & subj_key, suppress_errors=True)
/usr/local/lib/python3.8/dist-packages/datajoint/table.py in delete(self, transaction, safemode) 417 # Cascading delete 418 try: --> 419 delete_count = self._delete_cascade() 420 except: 421 if transaction:
/usr/local/lib/python3.8/dist-packages/datajoint/table.py in _delete_cascade(self) 355 for _ in range(max_attempts): 356 try: --> 357 delete_count += self.delete_quick(get_count=True) 358 except IntegrityError as error: 359 match = foreign_key_error_regexp.match(error.args[0]).groupdict()
/usr/local/lib/python3.8/dist-packages/datajoint/table.py in delete_quick(self, get_count) 344 """ 345 query = 'DELETE FROM ' + self.full_table_name + self.where_clause() --> 346 self.connection.query(query) 347 count = self.connection.query("SELECT ROW_COUNT()").fetchone()[0] if get_count else None 348 self._log(query[:255])
/usr/local/lib/python3.8/dist-packages/datajoint/connection.py in query(self, query, args, as_dict, suppress_warnings, reconnect) 300 cursor = self._conn.cursor(cursor=cursor_class) 301 try: --> 302 self._execute_query(cursor, query, args, suppress_warnings) 303 except errors.LostConnectionError: 304 if not reconnect:
/usr/local/lib/python3.8/dist-packages/datajoint/connection.py in _execute_query(cursor, query, args, suppress_warnings) 266 cursor.execute(query, args) 267 except client.err.Error as err: --> 268 raise translate_query_error(err, query) 269 270 def query(self, query, args=(), *, as_dict=False, suppress_warnings=True, reconnect=None):
/usr/local/lib/python3.8/dist-packages/datajoint/connection.py in _execute_query(cursor, query, args, suppress_warnings) 264 # suppress all warnings arising from underlying SQL library 265 warnings.simplefilter("ignore") --> 266 cursor.execute(query, args) 267 except client.err.Error as err: 268 raise translate_query_error(err, query)
/usr/local/lib/python3.8/dist-packages/pymysql/cursors.py in execute(self, query, args) 146 query = self.mogrify(query, args) 147 --> 148 result = self._query(query) 149 self._executed = query 150 return result
/usr/local/lib/python3.8/dist-packages/pymysql/cursors.py in _query(self, q) 308 self._last_executed = q 309 self._clear_result() --> 310 conn.query(q) 311 self._do_get_result() 312 return self.rowcount
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in query(self, sql, unbuffered) 546 sql = sql.encode(self.encoding, "surrogateescape") 547 self._execute_command(COMMAND.COM_QUERY, sql) --> 548 self._affected_rows = self._read_query_result(unbuffered=unbuffered) 549 return self._affected_rows 550
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in _read_query_result(self, unbuffered) 773 else: 774 result = MySQLResult(self) --> 775 result.read() 776 self._result = result 777 if result.server_status is not None:
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in read(self) 1154 def read(self): 1155 try: -> 1156 first_packet = self.connection._read_packet() 1157 1158 if first_packet.is_ok_packet():
/usr/local/lib/python3.8/dist-packages/pymysql/connections.py in _read_packet(self, packet_type) 723 if self._result is not None and self._result.unbuffered_active is True: 724 self._result.unbuffered_active = False --> 725 packet.raise_for_error() 726 return packet 727
/usr/local/lib/python3.8/dist-packages/pymysql/protocol.py in raise_for_error(self) 219 if DEBUG: 220 print("errno =", errno) --> 221 err.raise_mysql_exception(self._data) 222 223 def dump(self):
/usr/local/lib/python3.8/dist-packages/pymysql/err.py in raise_mysql_exception(data) 141 if errorclass is None: 142 errorclass = InternalError if errno < 1000 else OperationalError --> 143 raise errorclass(errno, errval)
OperationalError: (1104, 'The SELECT would examine more than MAX_JOIN_SIZE rows; check your WHERE and use SET SQL_BIG_SELECTS=1 or SET MAX_JOIN_SIZE=# if the SELECT is okay')
Did you mean to leave `latest` in the restriction of the set to be deleted? You already applied it to get `subj_keys`. Deletes with complex subqueries can get tricky.
@shenshan I agree with @dimitri-yatsenko. It seems like the additional `& latest` restriction is not necessary. Can you verify the behavior without it in both 0.12.9 and 0.13.2? Also, could you call `.make_sql()` on the queries in both 0.12.9 and 0.13.2 and share the result here? We will need to review the generated queries to understand why the join and group-by resolve to so many records in the latest version.
Hi @guzman-raphael, thanks for looking into this. The `& latest` restriction is necessary. The `latest` output looks like this:

and `behavior.CumulativeSummary` looks like this:

I do need the field `latest_date` from `latest` to restrict `behavior.CumulativeSummary`.

Here are the `.make_sql()` results:

For 0.12.9:
For 0.12.9:
In [11]: (behavior_plotting.CumulativeSummary & subj_key & latest).make_sql()
Out[11]: 'SELECT * FROM `ibl_plotting_behavior`.`__cumulative_summary` WHERE ((`subject_uuid`=X\'06e159d76425469b9d077efcd3179aca\')) AND ((`subject_uuid`,`latest_date`) in (SELECT `subject_uuid`,`latest_date` FROM (SELECT `subject_uuid`,MAX(checking_ts) as `checking_ts` FROM `ibl_subject`.`subject` NATURAL JOIN `ibl_plotting_behavior`.`latest_date` GROUP BY `subject_uuid`) as `_s20` NATURAL JOIN `ibl_plotting_behavior`.`latest_date` WHERE ((latest_date between curdate() - interval 30 day and curdate() OR (`subject_uuid`) in (SELECT `subject_uuid` FROM `ibl_subject`.`subject` WHERE ((`subject_uuid`) not in (SELECT `subject_uuid` FROM `ibl_subject`.`death`))))) AND ((`subject_uuid`) in (SELECT `subject_uuid` FROM (SELECT * FROM `ibl_subject`.`subject` WHERE (subject_nickname not like "%%human%%")) as `_s21`))))'
For 0.13.2
In [7]: (behavior_plotting.CumulativeSummary & subj_key & latest).make_sql()
Out[7]: 'SELECT `subject_uuid`,`latest_date`,`cumulative_summary_ts` FROM `ibl_plotting_behavior`.`__cumulative_summary` WHERE((`subject_uuid`=X\'06e159d76425469b9d077efcd3179aca\'))AND((`subject_uuid`,`latest_date`) in (SELECT `subject_uuid`,`latest_date` FROM (SELECT `subject_uuid`,MAX(checking_ts) as `checking_ts` FROM `ibl_subject`.`subject` NATURAL JOIN `ibl_plotting_behavior`.`latest_date` GROUP BY `subject_uuid`) as `$2` NATURAL JOIN (SELECT `subject_uuid`,`checking_ts`,`latest_date` FROM `ibl_plotting_behavior`.`latest_date`) as `$3` WHERE((latest_date between curdate() - interval 30 day and curdate() OR (`subject_uuid`) in (SELECT `subject_uuid` FROM `ibl_subject`.`subject` WHERE((`subject_uuid`) not in (SELECT `subject_uuid` FROM `ibl_subject`.`death`)))))AND((`subject_uuid`) in (SELECT DISTINCT `subject_uuid` FROM `ibl_subject`.`subject` WHERE(subject_nickname not like "%%human%%")))))'