Flaky Tests
Please comment with (or +1) your flaky tests. Don't just say "gpu tests"; name the specific tests that are failing.
`unittests_gpu17`, on the "Install opencv, vqa-maskrcnn-benchmark" step:
Often fails with `fatal: unable to access 'https://gitlab.com/vedanuj/vqa-maskrcnn-benchmark.git/': The requested URL returned error: 502`, and succeeds when re-run.
Example:
https://app.circleci.com/pipelines/github/facebookresearch/ParlAI/8984/workflows/f00f8441-16ad-42fc-8d87-f2a0b297b29c/jobs/73296
self = <tests.test_transformers.TestTransformerGenerator testMethod=test_beamdelay>
def test_beamdelay(self):
"""
Test delayedbeam generation.
"""
# Delayed Beam is inherently stochastic, just ensure no crash.
opt = ParlaiParser(True, True).parse_kwargs(
model_file='zoo:unittest/transformer_generator2/model',
inference='delayedbeam',
topk=10,
beam_delay=2,
beam_min_length=2,
)
agent = create_agent(opt, True)
agent.observe({'text': '1\n1\n2\n2\n3\n3\n4', 'episode_done': True})
result = agent.act()
assert 'text' in result
assert result['text'] != ''
> assert '1 2' in result['text']
E AssertionError: assert '1 2' in '1 3 2 3'
self = <test_bert.TestBertModel testMethod=test_crossencoder>
@testing_utils.retry(ntries=3, log_retry=True)
def test_crossencoder(self):
valid, test = testing_utils.train_model(
dict(
task='convai2',
model='bert_ranker/cross_encoder_ranker',
num_epochs=0.002,
batchsize=1,
candidates="inline",
type_optimization="all_encoder_layers",
warmup_updates=100,
text_truncate=32,
label_truncate=32,
validation_max_exs=20,
short_final_eval=True,
)
)
# The cross encoder reaches an interesting state MUCH faster
# accuracy should be present and somewhere between 0.2 and 0.8
# (large interval so that it doesn't flake.)
> self.assertGreaterEqual(test['accuracy'], 0.03)
E AssertionError: ExactMatchMetric(0) not greater than or equal to 0.03
tests/nightly/gpu/test_bert.py:59: AssertionError
self = <test_model_chat.TestModelChat testMethod=test_base_task>
def test_base_task(self):
with testing_utils.tempdir() as tmpdir:
# Paths
expected_states_folder = os.path.join(
os.path.dirname(os.path.abspath(__file__)), 'expected_states'
)
expected_chat_data_path = os.path.join(
expected_states_folder, 'final_chat_data.json'
)
expected_state_path = os.path.join(expected_states_folder, 'state.json')
model_opt_path = os.path.join(tmpdir, 'model_opts.yaml')
chat_data_folder = os.path.join(tmpdir, 'final_chat_data')
# Create a model opt file for the fixed-response model
with open(model_opt_path, 'w') as f:
model_opt_contents = f"""\
fixed_response: >
--model fixed_response
"""
f.write(model_opt_contents)
# Set up the config and database
num_blender_convos = 10
args = ModelChatBlueprintArgs()
overrides = [
f'+mephisto.blueprint.{key}={val}'
for key, val in args.__dict__.items()
if key
in [
'max_onboard_time',
'max_resp_time',
'override_opt',
'random_seed',
'world_file',
]
] + [
'mephisto.blueprint.annotations_config_path=${task_dir}/task_config/annotations_config.json',
f'mephisto.blueprint.conversations_needed_string=\"fixed_response:{num_blender_convos:d}\"',
f'mephisto.blueprint.chat_data_folder={chat_data_folder}',
'+mephisto.blueprint.left_pane_text_path=${task_dir}/task_config/left_pane_text.html',
'+mephisto.blueprint.max_concurrent_responses=1',
f'mephisto.blueprint.model_opt_path={model_opt_path}',
f'+mephisto.blueprint.num_conversations={num_blender_convos:d}',
'+mephisto.blueprint.onboard_task_data_path=${task_dir}/task_config/onboard_task_data.json',
'+mephisto.blueprint.task_description_file=${task_dir}/task_config/task_description.html',
]
# TODO: remove all of these params once Hydra 1.1 is released with
# support for recursive defaults
self._set_up_config(
blueprint_type=BLUEPRINT_TYPE,
task_directory=TASK_DIRECTORY,
overrides=overrides,
)
# Set up the operator and server
shared_state = SharedModelChatTaskState(world_module=world_module)
self._set_up_server(shared_state=shared_state)
# Check that the agent states are as they should be
self._get_channel_info().job.task_runner.task_run.get_blueprint().use_onboarding = (
False
)
# Don't require onboarding for this test agent
with open(expected_state_path) as f:
expected_state = json.load(f)
self._test_agent_states(
num_agents=1,
agent_display_ids=AGENT_DISPLAY_IDS,
agent_messages=AGENT_MESSAGES,
form_messages=FORM_MESSAGES,
form_task_data=FORM_TASK_DATA,
expected_states=(expected_state,),
> agent_task_data=AGENT_TASK_DATA,
)
tests/crowdsourcing/tasks/model_chat/test_model_chat.py:159:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
parlai/crowdsourcing/utils/tests.py:264: in _test_agent_states
agent_ids = self._register_mock_agents(num_agents=num_agents)
parlai/crowdsourcing/utils/tests.py:170: in _register_mock_agents
agents = self.db.find_agents()
../Mephisto/mephisto/abstractions/databases/local_database.py:1076: in find_agents
return [Agent(self, str(r["agent_id"]), row=r) for r in rows]
../Mephisto/mephisto/abstractions/databases/local_database.py:1076: in <listcomp>
return [Agent(self, str(r["agent_id"]), row=r) for r in rows]
../Mephisto/mephisto/abstractions/providers/mock/mock_agent.py:31: in __init__
super().__init__(db, db_id, row=row)
../Mephisto/mephisto/data_model/agent.py:73: in __init__
self.state = AgentState(self) # type: ignore
../Mephisto/mephisto/abstractions/blueprints/parlai_chat/parlai_chat_agent_state.py:37: in __init__
self.load_data()
../Mephisto/mephisto/abstractions/blueprints/parlai_chat/parlai_chat_agent_state.py:72: in load_data
state = json.load(state_json)
/usr/local/lib/python3.7/json/__init__.py:296: in load
parse_constant=parse_constant, object_pairs_hook=object_pairs_hook, **kw)
/usr/local/lib/python3.7/json/__init__.py:348: in loads
return _default_decoder.decode(s)
/usr/local/lib/python3.7/json/decoder.py:337: in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <json.decoder.JSONDecoder object at 0x7f119dbded90>, s = '', idx = 0
def raw_decode(self, s, idx=0):
"""Decode a JSON document from ``s`` (a ``str`` beginning with
a JSON document) and return a 2-tuple of the Python
representation and the index in ``s`` where the document ended.
This can be used to decode a JSON document from a string that may
have extraneous data at the end.
"""
try:
obj, end = self.scan_once(s, idx)
except StopIteration as err:
> raise JSONDecodeError("Expecting value", s, err.value) from None
E json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
/usr/local/lib/python3.7/json/decoder.py:355: JSONDecodeError
self = <tests.test_distributed.TestDistributedEval testMethod=test_mp_eval>
def test_mp_eval(self):
args = dict(
task='integration_tests:multiturn_nocandidate',
model='seq2seq',
model_file='zoo:unittest/seq2seq/model',
dict_file='zoo:unittest/seq2seq/model.dict',
skip_generation=False,
batchsize=8,
)
valid, _ = testing_utils.eval_model(args, skip_test=True)
from parlai.scripts.multiprocessing_eval import MultiProcessEval
> valid_mp = MultiProcessEval.main(**args)
tests/test_distributed.py:218:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
parlai/core/script.py:126: in main
return cls._run_kwargs(kwargs)
parlai/core/script.py:92: in _run_kwargs
return cls._run_from_parser_and_opt(opt, parser)
parlai/core/script.py:107: in _run_from_parser_and_opt
return script.run()
parlai/scripts/multiprocessing_eval.py:92: in run
return launch_and_eval(self.opt, port)
parlai/scripts/multiprocessing_eval.py:66: in launch_and_eval
retval = multiprocess_eval(0, opt, port)
parlai/scripts/multiprocessing_eval.py:44: in multiprocess_eval
rank, opt, rank_offset, gpu, init_method=init_method
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/contextlib.py:112: in __enter__
return next(self.gen)
parlai/utils/distributed.py:282: in distributed_context
rank=rank,
../venv/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py:436: in init_process_group
store, rank, world_size = next(rendezvous_iterator)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
url = 'tcp://localhost:42476?rank=0&world_size=2'
timeout = datetime.timedelta(seconds=1800), kwargs = {}
_error = <function _tcp_rendezvous_handler.<locals>._error at 0x7f19f85c5e18>
result = ParseResult(scheme='tcp', netloc='localhost:42476', path='', params='', query='rank=0&world_size=2', fragment='')
query = {'rank': '0', 'world_size': '2'}, rank = 0, world_size = 2
start_daemon = True
def _tcp_rendezvous_handler(url, timeout=default_pg_timeout, **kwargs):
def _error(msg):
return _rendezvous_error("tcp:// rendezvous: " + msg)
result = urlparse(url)
if not result.port:
raise _error("port number missing")
query = dict(pair.split("=") for pair in filter(None, result.query.split("&")))
if "rank" not in query:
raise _error("rank parameter missing")
if "world_size" not in query:
raise _error("world size parameter missing")
rank = int(query["rank"])
world_size = int(query["world_size"])
start_daemon = rank == 0
> store = TCPStore(result.hostname, result.port, world_size, start_daemon, timeout)
E RuntimeError: Address already in use
../venv/lib/python3.7/site-packages/torch/distributed/rendezvous.py:133: RuntimeError
self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>
@contextmanager
def _error_catcher(self):
"""
Catch low-level python exceptions, instead re-raising urllib3
variants, so that low-level exceptions are not leaked in the
high-level api.
On exit, release the connection back to the pool.
"""
clean_exit = False
try:
try:
> yield
../venv/lib/python3.7/site-packages/urllib3/response.py:436:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>, amt = 32768
decode_content = True, cache_content = False
def read(self, amt=None, decode_content=None, cache_content=False):
"""
Similar to :meth:`httplib.HTTPResponse.read`, but with two additional
parameters: ``decode_content`` and ``cache_content``.
:param amt:
How much of the content to read. If specified, caching is skipped
because it doesn't make sense to cache partial content as the full
response.
:param decode_content:
If True, will attempt to decode the body based on the
'content-encoding' header.
:param cache_content:
If True, will save the returned data such that the same result is
returned despite of the state of the underlying file object. This
is useful if you want the ``.data`` property to continue working
after having ``.read()`` the file object. (Overridden if ``amt`` is
set.)
"""
self._init_decoder()
if decode_content is None:
decode_content = self.decode_content
if self._fp is None:
return
flush_decoder = False
fp_closed = getattr(self._fp, "closed", False)
with self._error_catcher():
if amt is None:
# cStringIO doesn't like amt=None
data = self._fp.read() if not fp_closed else b""
flush_decoder = True
else:
cache_content = False
> data = self._fp.read(amt) if not fp_closed else b""
../venv/lib/python3.7/site-packages/urllib3/response.py:518:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <http.client.HTTPResponse object at 0x7fe304f07dd8>, amt = 32768
def read(self, amt=None):
if self.fp is None:
return b""
if self._method == "HEAD":
self._close_conn()
return b""
if amt is not None:
# Amount is given, implement using readinto
b = bytearray(amt)
> n = self.readinto(b)
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/http/client.py:447:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <http.client.HTTPResponse object at 0x7fe304f07dd8>
b = bytearray(b'\x1f\x8b\x08\x00\x9a%\xef[\x00\x03\xec\xbd\xdb\x92\x1cG\x92%X\xcf\xfe\x15\x9e\xf5\x02\x96H2\x05\xbc\x15Y]\...0\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
def readinto(self, b):
"""Read up to len(b) bytes into bytearray b and return the number
of bytes read.
"""
if self.fp is None:
return 0
if self._method == "HEAD":
self._close_conn()
return 0
if self.chunked:
return self._readinto_chunked(b)
if self.length is not None:
if len(b) > self.length:
# clip the read to the "end of response"
b = memoryview(b)[0:self.length]
# we do not use _safe_read() here because this may be a .will_close
# connection, and the user is reading more bytes than will be provided
# (for example, reading in 1k chunks)
> n = self.fp.readinto(b)
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/http/client.py:491:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <socket.SocketIO object at 0x7fe304f07da0>
b = <memory at 0x7fe304f0a108>
def readinto(self, b):
"""Read up to len(b) bytes into the writable buffer *b* and return
the number of bytes read. If the socket is non-blocking and no bytes
are available, None is returned.
If *b* is non-empty, a 0 return value indicates that the connection
was shutdown at the other end.
"""
self._checkClosed()
self._checkReadable()
if self._timeout_occurred:
raise OSError("cannot read from timed out object")
while True:
try:
> return self._sock.recv_into(b)
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/socket.py:589:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <ssl.SSLSocket [closed] fd=-1, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6>
buffer = <memory at 0x7fe304f0a108>, nbytes = 31183, flags = 0
def recv_into(self, buffer, nbytes=None, flags=0):
self._checkClosed()
if buffer and (nbytes is None):
nbytes = len(buffer)
elif nbytes is None:
nbytes = 1024
if self._sslobj is not None:
if flags != 0:
raise ValueError(
"non-zero flags not allowed in calls to recv_into() on %s" %
self.__class__)
> return self.read(nbytes, buffer)
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/ssl.py:1049:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <ssl.SSLSocket [closed] fd=-1, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=6>
len = 31183, buffer = <memory at 0x7fe304f0a108>
def read(self, len=1024, buffer=None):
"""Read up to LEN bytes and return them.
Return zero-length string on EOF."""
self._checkClosed()
if self._sslobj is None:
raise ValueError("Read on closed or unwrapped SSL socket.")
try:
if buffer is not None:
> return self._sslobj.read(len, buffer)
E ConnectionResetError: [Errno 104] Connection reset by peer
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/ssl.py:908: ConnectionResetError
During handling of the above exception, another exception occurred:
def generate():
# Special case for urllib3.
if hasattr(self.raw, 'stream'):
try:
> for chunk in self.raw.stream(chunk_size, decode_content=True):
../venv/lib/python3.7/site-packages/requests/models.py:753:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>, amt = 32768
decode_content = True
def stream(self, amt=2 ** 16, decode_content=None):
"""
A generator wrapper for the read() method. A call will block until
``amt`` bytes have been read from the connection or until the
connection is closed.
:param amt:
How much of the content to read. The generator will return up to
much data per iteration, but may return less. This is particularly
likely when using compressed data. However, the empty string will
never be returned.
:param decode_content:
If True, will attempt to decode the body based on the
'content-encoding' header.
"""
if self.chunked and self.supports_chunked_reads():
for line in self.read_chunked(amt, decode_content=decode_content):
yield line
else:
while not is_fp_closed(self._fp):
> data = self.read(amt=amt, decode_content=decode_content)
../venv/lib/python3.7/site-packages/urllib3/response.py:575:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>, amt = 32768
decode_content = True, cache_content = False
def read(self, amt=None, decode_content=None, cache_content=False):
"""
Similar to :meth:`httplib.HTTPResponse.read`, but with two additional
parameters: ``decode_content`` and ``cache_content``.
:param amt:
How much of the content to read. If specified, caching is skipped
because it doesn't make sense to cache partial content as the full
response.
:param decode_content:
If True, will attempt to decode the body based on the
'content-encoding' header.
:param cache_content:
If True, will save the returned data such that the same result is
returned despite of the state of the underlying file object. This
is useful if you want the ``.data`` property to continue working
after having ``.read()`` the file object. (Overridden if ``amt`` is
set.)
"""
self._init_decoder()
if decode_content is None:
decode_content = self.decode_content
if self._fp is None:
return
flush_decoder = False
fp_closed = getattr(self._fp, "closed", False)
with self._error_catcher():
if amt is None:
# cStringIO doesn't like amt=None
data = self._fp.read() if not fp_closed else b""
flush_decoder = True
else:
cache_content = False
> data = self._fp.read(amt) if not fp_closed else b""
../venv/lib/python3.7/site-packages/urllib3/response.py:518:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <contextlib._GeneratorContextManager object at 0x7fe304f106a0>
type = <class 'ConnectionResetError'>
value = ConnectionResetError(104, 'Connection reset by peer')
traceback = <traceback object at 0x7fe304f01f88>
def __exit__(self, type, value, traceback):
if type is None:
try:
next(self.gen)
except StopIteration:
return False
else:
raise RuntimeError("generator didn't stop")
else:
if value is None:
# Need to force instantiation so we can reliably
# tell if we get the same exception back
value = type()
try:
> self.gen.throw(type, value, traceback)
/opt/circleci/.pyenv/versions/3.7.0/lib/python3.7/contextlib.py:130:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <urllib3.response.HTTPResponse object at 0x7fe304f07d30>
@contextmanager
def _error_catcher(self):
"""
Catch low-level python exceptions, instead re-raising urllib3
variants, so that low-level exceptions are not leaked in the
high-level api.
On exit, release the connection back to the pool.
"""
clean_exit = False
try:
try:
yield
except SocketTimeout:
# FIXME: Ideally we'd like to include the url in the ReadTimeoutError but
# there is yet no clean way to get at it from this context.
raise ReadTimeoutError(self._pool, None, "Read timed out.")
except BaseSSLError as e:
# FIXME: Is there a better way to differentiate between SSLErrors?
if "read operation timed out" not in str(e): # Defensive:
# This shouldn't happen but just in case we're missing an edge
# case, let's avoid swallowing SSL errors.
raise
raise ReadTimeoutError(self._pool, None, "Read timed out.")
except (HTTPException, SocketError) as e:
# This includes IncompleteRead.
> raise ProtocolError("Connection broken: %r" % e, e)
E urllib3.exceptions.ProtocolError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))
../venv/lib/python3.7/site-packages/urllib3/response.py:454: ProtocolError
During handling of the above exception, another exception occurred:
self = <test_transresnet_multimodal.TestTransresnet testMethod=test_transresnet>
def test_transresnet(self):
"""
Test pretrained model.
"""
> _, test = testing_utils.eval_model(MODEL_OPTIONS, skip_valid=True)
tests/nightly/gpu/test_transresnet_multimodal.py:44:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
parlai/utils/testing.py:361: in eval_model
test = None if skip_test else ems.EvalModel.main(**opt)
parlai/core/script.py:126: in main
return cls._run_kwargs(kwargs)
parlai/core/script.py:92: in _run_kwargs
return cls._run_from_parser_and_opt(opt, parser)
parlai/core/script.py:107: in _run_from_parser_and_opt
return script.run()
parlai/scripts/eval_model.py:222: in run
return eval_model(self.opt)
parlai/scripts/eval_model.py:191: in eval_model
agent = create_agent(opt, requireModelExists=True)
parlai/core/agents.py:402: in create_agent
model = create_agent_from_opt_file(opt)
parlai/core/agents.py:355: in create_agent_from_opt_file
return model_class(opt_from_file)
projects/image_chat/transresnet_multimodal/transresnet_multimodal.py:70: in __init__
super().__init__(opt, shared)
projects/personality_captions/transresnet/transresnet.py:101: in __init__
self.personalities_list = self.load_personalities()
projects/personality_captions/transresnet/transresnet.py:193: in load_personalities
build(self.opt)
parlai/tasks/personality_captions/build.py:32: in build
downloadable_file.download_file(dpath)
parlai/core/build_data.py:87: in download_file
download(self.url, dpath, self.file_name)
parlai/core/build_data.py:181: in download
for chunk in response.iter_content(CHUNK_SIZE):
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
def generate():
# Special case for urllib3.
if hasattr(self.raw, 'stream'):
try:
for chunk in self.raw.stream(chunk_size, decode_content=True):
yield chunk
except ProtocolError as e:
> raise ChunkedEncodingError(e)
E requests.exceptions.ChunkedEncodingError: ("Connection broken: ConnectionResetError(104, 'Connection reset by peer')", ConnectionResetError(104, 'Connection reset by peer'))
../venv/lib/python3.7/site-packages/requests/models.py:756: ChunkedEncodingError
self = <tests.test_tra.TestTransformerRanker testMethod=test_train_batch_all>
@testing_utils.retry(ntries=3)
def test_train_batch_all(self):
args = self._get_args()
args['candidates'] = 'batch-all-cands'
args['eval_candidates'] = 'batch-all-cands'
valid, test = testing_utils.train_model(args)
threshold = self._get_threshold()
> self.assertGreaterEqual(valid['hits@1'], threshold)
E AssertionError: AverageMetric(0.5) not greater than or equal to 0.8
tests/test_tra.py:92: AssertionError
The following test, along with several others, failed with this error (see the traceback below).
Other tests:
- test_chunked_teacher
- test_distributed_eval_max_exs
- test_distributed_eval_stream_mode
- test_distributed_eval_stream_mode_max_exs
- test_generator_distributed
- test_multitask_distributed
self = <tests.test_distributed.TestZero2 testMethod=test_chunked_dynamic_teacher>
def test_chunked_dynamic_teacher(self):
valid, test = self._distributed_train_model(
task='integration_tests',
num_epochs=0.01,
datatype='train:stream',
dynamic_batching='full',
> truncate=16,
)
tests/test_distributed.py:146:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
tests/test_distributed.py:33: in _distributed_train_model
valid, test = mp_train.launch_and_train(popt)
parlai/scripts/multiprocessing_train.py:75: in launch_and_train
retval = multiprocess_train(0, opt, port)
parlai/scripts/multiprocessing_train.py:45: in multiprocess_train
return single_train.TrainLoop(opt).train()
parlai/scripts/train_model.py:347: in __init__
self.agent = create_agent(opt)
parlai/core/agents.py:479: in create_agent
model = model_class(opt)
parlai/core/torch_generator_agent.py:484: in __init__
self.model = fsdp_utils.fsdp_wrap(self.build_model())
parlai/utils/fsdp.py:111: in fsdp_wrap
return wrap(module)
../venv/lib/python3.7/site-packages/fairscale/nn/wrap/auto_wrap.py:170: in wrap
return ConfigAutoWrap.wrapper_cls(module, **wrap_overrides)
../venv/lib/python3.7/site-packages/fairscale/nn/data_parallel/fully_sharded_data_parallel.py:300: in __init__
self._fsdp_wrapped_module: nn.Module = FlattenParamsWrapper(module, param_list=params)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = FlattenParamsWrapper(
(_fpw_module): TransformerGeneratorModel_Swappable(
(embeddings): Embedding(11, 8, padding... (norm3): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
)
)
)
)
)
)
)
module = TransformerGeneratorModel_Swappable(
(embeddings): Embedding(11, 8, padding_idx=0)
(encoder): TransformerEncoder_S... )
(norm3): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
)
)
)
)
)
)
param_list = [[Parameter containing:
tensor([[ 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
[-0.... [ 6.5029e-01, -7.5969e-01, 9.9749e-01, 7.0737e-02, 1.4944e-01,
9.8877e-01, 1.4999e-02, 9.9989e-01]])]]
def __init__(self, module: nn.Module, param_list: ParamGroups = None):
super().__init__()
self._fpw_module = module
self.is_flattened = False
# Handle param_list being None.
if param_list is None:
param_list = list(module.parameters())
# Be backward compatible and turn a single param list into a list of
# a single list.
if len(param_list) > 0 and isinstance(param_list[0], nn.Parameter):
param_list = [cast(List[nn.Parameter], param_list)]
# Since the parameters will be deleted, let's record the number original
# parameters managed by this class. This and get_param_views function
# below are used by fsdp_optim_utils.py to save/restore optimizer state,
# which mirrors the flatten parameters here.
self.num_params_managed = 0
self._param_sets = []
overall_param_set: Set[nn.Parameter] = set()
for p_list in param_list:
# Remove any duplicates from the list.
p_set: Set[nn.Parameter] = set(cast(List[nn.Parameter], p_list))
self.num_params_managed += len(p_set)
overall_param_set = overall_param_set.union(p_set)
# Convert from list of Parameters to set of (Module, name) tuples,
# which will survive in case the parameter instances are reset.
# Also, a shared param will correctly appear under multiple modules
# as they should.
new_p_set_with_names = set()
for m in self.modules():
for n, p in m.named_parameters(recurse=False):
if p in p_set:
new_p_set_with_names.add((m, n))
if new_p_set_with_names:
self._param_sets.append(new_p_set_with_names)
if len(overall_param_set) != self.num_params_managed:
# Each p_list above could have shared params. However, you can't
# have shared params cross different p_list. That means part of
# the flattened parameter must be shared, which is impossible to
# support.
raise ValueError(f"Incorrect param groups {len(overall_param_set)} vs {self.num_param_managed}")
self.flat_params: List[FlatParameter] = []
self._param_infos: List[Tuple[str, nn.Module, str]] = []
self._shared_param_infos: List[Tuple[nn.Module, str, nn.Module, str]] = []
# Init all flat_params.
for new_p_set in self._param_sets:
params = self._init_flatten_params(new_p_set)
assert (
len(set(p.requires_grad for p in params)) == 1
> ), "expects all parameters in the same parameter group of the module to have same requires_grad"
E AssertionError: expects all parameters in the same parameter group of the module to have same requires_grad
../venv/lib/python3.7/site-packages/fairscale/nn/misc/flatten_params_wrapper.py:187: AssertionError