docker-airflow
docker-airflow copied to clipboard
Connection keeps dropping. Why?
Hi, I am running 3 containers on kubernetes.
- Postgresql
- Airflow scheduler with gitsync sidecar
- Airflow webserver with gitsync sidecar.
Everything works like a charm - for about 30 seconds to 3 minutes. Then I always get the same error:
Process DagFileProcessor2880-Process:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2345, in _wrap_pool_connect
return fn()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 364, in connect
return _ConnectionFairy._checkout(self)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 778, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 495, in checkout
rec = pool._do_get()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/impl.py", line 140, in _do_get
self._dec_overflow()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py", line 69, in __exit__
exc_value, with_traceback=exc_tb,
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 178, in raise_
raise exception
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/impl.py", line 137, in _do_get
return self._create_connection()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 309, in _create_connection
return _ConnectionRecord(self)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 440, in __init__
self.__connect(first_connect_check=True)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 661, in __connect
pool.logger.debug("Error on connect(): %s", e)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py", line 69, in __exit__
exc_value, with_traceback=exc_tb,
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 178, in raise_
raise exception
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 656, in __connect
connection = pool._invoke_creator(self)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
return dialect.connect(*cargs, **cparams)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 490, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/usr/local/lib/python3.6/site-packages/psycopg2/__init__.py", line 127, in connect
conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
psycopg2.OperationalError: could not connect to server: Connection refused
Is the server running on host "airflow-main-service" (172.30.14.145) and accepting
TCP/IP connections on port 5434?
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/usr/local/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
self.run()
File "/usr/local/lib/python3.6/multiprocessing/process.py", line 93, in run
self._target(*self._args, **self._kwargs)
File "/usr/local/lib/python3.6/site-packages/airflow/jobs/scheduler_job.py", line 158, in _run_file_processor
pickle_dags)
File "/usr/local/lib/python3.6/site-packages/airflow/utils/db.py", line 74, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/airflow/jobs/scheduler_job.py", line 1582, in process_file
dag.sync_to_db()
File "/usr/local/lib/python3.6/site-packages/airflow/utils/db.py", line 74, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.6/site-packages/airflow/models/dag.py", line 1496, in sync_to_db
DagModel).filter(DagModel.dag_id == self.dag_id).first()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3375, in first
ret = list(self[0:1])
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3149, in __getitem__
return list(res)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3481, in __iter__
return self._execute_and_instances(context)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3503, in _execute_and_instances
querycontext, self._connection_from_session, close_with_result=True
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3518, in _get_bind_args
mapper=self._bind_mapper(), clause=querycontext.statement, **kw
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/query.py", line 3496, in _connection_from_session
conn = self.session.connection(**kw)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/session.py", line 1141, in connection
execution_options=execution_options,
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/session.py", line 1147, in _connection_for_bind
engine, execution_options
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/orm/session.py", line 433, in _connection_for_bind
conn = bind._contextual_connect()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2311, in _contextual_connect
self._wrap_pool_connect(self.pool.connect, None),
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2349, in _wrap_pool_connect
e, dialect, self
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 1591, in _handle_dbapi_exception_noconnection
sqlalchemy_exception, with_traceback=exc_info[2], from_=e
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 178, in raise_
raise exception
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2345, in _wrap_pool_connect
return fn()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 364, in connect
return _ConnectionFairy._checkout(self)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 778, in _checkout
fairy = _ConnectionRecord.checkout(pool)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 495, in checkout
rec = pool._do_get()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/impl.py", line 140, in _do_get
self._dec_overflow()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py", line 69, in __exit__
exc_value, with_traceback=exc_tb,
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 178, in raise_
raise exception
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/impl.py", line 137, in _do_get
return self._create_connection()
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 309, in _create_connection
return _ConnectionRecord(self)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 440, in __init__
self.__connect(first_connect_check=True)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 661, in __connect
pool.logger.debug("Error on connect(): %s", e)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py", line 69, in __exit__
exc_value, with_traceback=exc_tb,
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 178, in raise_
raise exception
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 656, in __connect
connection = pool._invoke_creator(self)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
return dialect.connect(*cargs, **cparams)
File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 490, in connect
return self.dbapi.connect(*cargs, **cparams)
File "/usr/local/lib/python3.6/site-packages/psycopg2/__init__.py", line 127, in connect
conn = _connect(dsn, connection_factory=connection_factory, **kwasync)
sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) could not connect to server: Connection refused
Is the server running on host "airflow-main-service" (172.30.14.145) and accepting
TCP/IP connections on port 5434?
Same error on both scheduler/webserver. What could the problem be? I checked the postgres container, no issue there. I used a port-forward and stayed connected to postgres for a few hours - never had any hangup/disconnect. I opened a shell in the scheduler container and connected to the postgres container by running a quick python interactive session
(I forget the exact details)
import time
import sqlalchemy
db_string = .....
while True:
Sqlalchem.... get all tables .... fetchall() --> Always printed out tables.
time.sleep(30)
and let this run for 30 minutes - no problems.
But both airflow services keep losing their connection.
Does the database run in the container "airflow-main-service"?
Issue solved?
Hi,
I have also observed the same issue in my setup recently. I am running Airflow 10.1.4 with MariaDB as the database, and a few of the tasks fail with the following error
[2020-09-20 14:46:09,719] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip /usr/local/lib/python3.6/site-packages/airflow/config_templates/airflow_local_settings.py:65: DeprecationWarning: The elasticsearch_host option in [elasticsearch] has been renamed to host - the old setting has been used, but please update your config.
[2020-09-20 14:46:09,719] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip ELASTICSEARCH_HOST = conf.get('elasticsearch', 'HOST')
[2020-09-20 14:46:09,719] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip /usr/local/lib/python3.6/site-packages/airflow/config_templates/airflow_local_settings.py:67: DeprecationWarning: The elasticsearch_log_id_template option in [elasticsearch] has been renamed to log_id_template - the old setting has been used, but please update your config.
[2020-09-20 14:46:09,719] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip ELASTICSEARCH_LOG_ID_TEMPLATE = conf.get('elasticsearch', 'LOG_ID_TEMPLATE')
[2020-09-20 14:46:09,719] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip /usr/local/lib/python3.6/site-packages/airflow/config_templates/airflow_local_settings.py:69: DeprecationWarning: The elasticsearch_end_of_log_mark option in [elasticsearch] has been renamed to end_of_log_mark - the old setting has been used, but please update your config.
[2020-09-20 14:46:09,719] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip ELASTICSEARCH_END_OF_LOG_MARK = conf.get('elasticsearch', 'END_OF_LOG_MARK')
[2020-09-20 14:46:10,119] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip [2020-09-20 14:46:10,119] {settings.py:213} INFO - settings.configure_orm(): Using pool settings. pool_size=50, max_overflow=10, pool_recycle=5400, pid=94919
[2020-09-20 14:46:15,218] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip [2020-09-20 14:46:15,217] {init.py:51} INFO - Using executor LocalExecutor
[2020-09-20 14:46:16,627] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip [2020-09-20 14:46:16,626] {dagbag.py:90} INFO - Filling up the DagBag from /home/orbdviz/datavisualization/workspace/dags/nc_pipe.py
[2020-09-20 14:48:24,891] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip Traceback (most recent call last):
[2020-09-20 14:48:24,891] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2285, in _wrap_pool_connect
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return fn()
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 363, in connect
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return _ConnectionFairy._checkout(self)
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 773, in _checkout
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip fairy = _ConnectionRecord.checkout(pool)
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 492, in checkout
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip rec = pool._do_get()
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/impl.py", line 238, in _do_get
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return self._create_connection()
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 308, in _create_connection
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return _ConnectionRecord(self)
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 437, in init
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip self.__connect(first_connect_check=True)
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 657, in _connect
[2020-09-20 14:48:24,892] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip pool.logger.debug("Error on connect(): %s", e)
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/langhelpers.py", line 69, in exit
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip exc_value, with_traceback=exc_tb,
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/util/compat.py", line 178, in raise
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip raise exception
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/pool/base.py", line 652, in __connect
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip connection = pool._invoke_creator(self)
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/strategies.py", line 114, in connect
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return dialect.connect(*cargs, **cparams)
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/sqlalchemy/engine/default.py", line 488, in connect
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return self.dbapi.connect(*cargs, **cparams)
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/MySQLdb/init.py", line 85, in Connect
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip return Connection(*args, **kwargs)
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/lib/python3.6/site-packages/MySQLdb/connections.py", line 208, in init
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip super(Connection, self).init(*args, **kwargs2)
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip _mysql_exceptions.OperationalError: (2006, "Can't connect to MySQL server on 'ctc2hz1-02-s40.uhc.com' (115)")
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip The above exception was the direct cause of the following exception:
[2020-09-20 14:48:24,893] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip
[2020-09-20 14:48:24,894] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip Traceback (most recent call last):
[2020-09-20 14:48:24,894] {base_task_runner.py:115} INFO - Job 2087: Subtask load_raw_tables_ip File "/usr/local/bin/airflow", line 32, in
The strange part is that I checked my database to see if there is any max-connection issue, but the number of connections that Airflow has opened on the database is also very low. When I re-run the jobs, they start working fine after 2-3 retries. Could you please guide us on how we should proceed on this?
Thanks