Spaces:
Paused
Paused
| # Copyright 2019 The TensorFlow Authors. All Rights Reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| # ============================================================================== | |
| """Private utilities for managing multiple TensorBoard processes.""" | |
| import base64 | |
| import dataclasses | |
| import datetime | |
| import errno | |
| import json | |
| import os | |
| import subprocess | |
| import tempfile | |
| import time | |
| import typing | |
| from typing import Optional | |
| from tensorboard import version | |
| from tensorboard.util import tb_logging | |
| class TensorBoardInfo: | |
| """Holds the information about a running TensorBoard instance. | |
| Attributes: | |
| version: Version of the running TensorBoard. | |
| start_time: Seconds since epoch. | |
| pid: ID of the process running TensorBoard. | |
| port: Port on which TensorBoard is running. | |
| path_prefix: Relative prefix to the path, may be empty. | |
| logdir: Data location used by the TensorBoard server, may be empty. | |
| db: Database connection used by the TensorBoard server, may be empty. | |
| cache_key: Opaque, as given by `cache_key` below. | |
| """ | |
| version: str | |
| start_time: int | |
| pid: int | |
| port: int | |
| path_prefix: str | |
| logdir: str | |
| db: str | |
| cache_key: str | |
| def data_source_from_info(info): | |
| """Format the data location for the given TensorBoardInfo. | |
| Args: | |
| info: A TensorBoardInfo value. | |
| Returns: | |
| A human-readable string describing the logdir or database connection | |
| used by the server: e.g., "logdir /tmp/logs". | |
| """ | |
| if info.db: | |
| return "db %s" % info.db | |
| else: | |
| return "logdir %s" % info.logdir | |
| def _info_to_string(info): | |
| """Convert a `TensorBoardInfo` to string form to be stored on disk. | |
| The format returned by this function is opaque and should only be | |
| interpreted by `_info_from_string`. | |
| Args: | |
| info: A valid `TensorBoardInfo` object. | |
| Raises: | |
| ValueError: If any field on `info` is not of the correct type. | |
| Returns: | |
| A string representation of the provided `TensorBoardInfo`. | |
| """ | |
| field_name_to_type = typing.get_type_hints(TensorBoardInfo) | |
| for key, field_type in field_name_to_type.items(): | |
| if not isinstance(getattr(info, key), field_type): | |
| raise ValueError( | |
| "expected %r of type %s, but found: %r" | |
| % (key, field_type, getattr(info, key)) | |
| ) | |
| if info.version != version.VERSION: | |
| raise ValueError( | |
| "expected 'version' to be %r, but found: %r" | |
| % (version.VERSION, info.version) | |
| ) | |
| json_value = dataclasses.asdict(info) | |
| return json.dumps(json_value, sort_keys=True, indent=4) | |
| def _info_from_string(info_string): | |
| """Parse a `TensorBoardInfo` object from its string representation. | |
| Args: | |
| info_string: A string representation of a `TensorBoardInfo`, as | |
| produced by a previous call to `_info_to_string`. | |
| Returns: | |
| A `TensorBoardInfo` value. | |
| Raises: | |
| ValueError: If the provided string is not valid JSON, or if it is | |
| missing any required fields, or if any field is of incorrect type. | |
| """ | |
| field_name_to_type = typing.get_type_hints(TensorBoardInfo) | |
| try: | |
| json_value = json.loads(info_string) | |
| except ValueError: | |
| raise ValueError("invalid JSON: %r" % (info_string,)) | |
| if not isinstance(json_value, dict): | |
| raise ValueError("not a JSON object: %r" % (json_value,)) | |
| expected_keys = frozenset(field_name_to_type.keys()) | |
| actual_keys = frozenset(json_value) | |
| missing_keys = expected_keys - actual_keys | |
| if missing_keys: | |
| raise ValueError( | |
| "TensorBoardInfo missing keys: %r" % (sorted(missing_keys),) | |
| ) | |
| # For forward compatibility, silently ignore unknown keys. | |
| # Validate and deserialize fields. | |
| fields = {} | |
| for key, field_type in field_name_to_type.items(): | |
| if not isinstance(json_value[key], field_type): | |
| raise ValueError( | |
| "expected %r of type %s, but found: %r" | |
| % (key, field_type, json_value[key]) | |
| ) | |
| fields[key] = json_value[key] | |
| return TensorBoardInfo(**fields) | |
| def cache_key(working_directory, arguments, configure_kwargs): | |
| """Compute a `TensorBoardInfo.cache_key` field. | |
| The format returned by this function is opaque. Clients may only | |
| inspect it by comparing it for equality with other results from this | |
| function. | |
| Args: | |
| working_directory: The directory from which TensorBoard was launched | |
| and relative to which paths like `--logdir` and `--db` are | |
| resolved. | |
| arguments: The command-line args to TensorBoard, as `sys.argv[1:]`. | |
| Should be a list (or tuple), not an unparsed string. If you have a | |
| raw shell command, use `shlex.split` before passing it to this | |
| function. | |
| configure_kwargs: A dictionary of additional argument values to | |
| override the textual `arguments`, with the same semantics as in | |
| `tensorboard.program.TensorBoard.configure`. May be an empty | |
| dictionary. | |
| Returns: | |
| A string such that if two (prospective or actual) TensorBoard | |
| invocations have the same cache key then it is safe to use one in | |
| place of the other. The converse is not guaranteed: it is often safe | |
| to change the order of TensorBoard arguments, or to explicitly set | |
| them to their default values, or to move them between `arguments` | |
| and `configure_kwargs`, but such invocations may yield distinct | |
| cache keys. | |
| """ | |
| if not isinstance(arguments, (list, tuple)): | |
| raise TypeError( | |
| "'arguments' should be a list of arguments, but found: %r " | |
| "(use `shlex.split` if given a string)" % (arguments,) | |
| ) | |
| datum = { | |
| "working_directory": working_directory, | |
| "arguments": arguments, | |
| "configure_kwargs": configure_kwargs, | |
| } | |
| raw = base64.b64encode( | |
| json.dumps(datum, sort_keys=True, separators=(",", ":")).encode("utf-8") | |
| ) | |
| # `raw` is of type `bytes`, even though it only contains ASCII | |
| # characters; we want it to be `str` in both Python 2 and 3. | |
| return str(raw.decode("ascii")) | |
| def _get_info_dir(): | |
| """Get path to directory in which to store info files. | |
| The directory returned by this function is "owned" by this module. If | |
| the contents of the directory are modified other than via the public | |
| functions of this module, subsequent behavior is undefined. | |
| The directory will be created if it does not exist. | |
| """ | |
| path = os.path.join(tempfile.gettempdir(), ".tensorboard-info") | |
| try: | |
| os.makedirs(path) | |
| except OSError as e: | |
| if e.errno == errno.EEXIST and os.path.isdir(path): | |
| pass | |
| else: | |
| raise | |
| else: | |
| os.chmod(path, 0o777) | |
| return path | |
| def _get_info_file_path(): | |
| """Get path to info file for the current process. | |
| As with `_get_info_dir`, the info directory will be created if it | |
| does not exist. | |
| """ | |
| return os.path.join(_get_info_dir(), "pid-%d.info" % os.getpid()) | |
| def write_info_file(tensorboard_info): | |
| """Write TensorBoardInfo to the current process's info file. | |
| This should be called by `main` once the server is ready. When the | |
| server shuts down, `remove_info_file` should be called. | |
| Args: | |
| tensorboard_info: A valid `TensorBoardInfo` object. | |
| Raises: | |
| ValueError: If any field on `info` is not of the correct type. | |
| """ | |
| payload = "%s\n" % _info_to_string(tensorboard_info) | |
| with open(_get_info_file_path(), "w") as outfile: | |
| outfile.write(payload) | |
| def remove_info_file(): | |
| """Remove the current process's TensorBoardInfo file, if it exists. | |
| If the file does not exist, no action is taken and no error is | |
| raised. | |
| """ | |
| try: | |
| os.unlink(_get_info_file_path()) | |
| except OSError as e: | |
| if e.errno == errno.ENOENT: | |
| # The user may have wiped their temporary directory or something. | |
| # Not a problem: we're already in the state that we want to be in. | |
| pass | |
| else: | |
| raise | |
| def get_all(): | |
| """Return TensorBoardInfo values for running TensorBoard processes. | |
| This function may not provide a perfect snapshot of the set of running | |
| processes. Its result set may be incomplete if the user has cleaned | |
| their /tmp/ directory while TensorBoard processes are running. It may | |
| contain extraneous entries if TensorBoard processes exited uncleanly | |
| (e.g., with SIGKILL or SIGQUIT). | |
| Entries in the info directory that do not represent valid | |
| `TensorBoardInfo` values will be silently ignored. | |
| Returns: | |
| A fresh list of `TensorBoardInfo` objects. | |
| """ | |
| info_dir = _get_info_dir() | |
| results = [] | |
| for filename in os.listdir(info_dir): | |
| filepath = os.path.join(info_dir, filename) | |
| try: | |
| with open(filepath) as infile: | |
| contents = infile.read() | |
| except IOError as e: | |
| if e.errno == errno.EACCES: | |
| # May have been written by this module in a process whose | |
| # `umask` includes some bits of 0o444. | |
| continue | |
| else: | |
| raise | |
| try: | |
| info = _info_from_string(contents) | |
| except ValueError: | |
| # Ignore unrecognized files, logging at debug only. | |
| tb_logging.get_logger().debug( | |
| "invalid info file: %r", | |
| filepath, | |
| exc_info=True, | |
| ) | |
| else: | |
| results.append(info) | |
| return results | |
| class StartReused: | |
| """Possible return value of the `start` function. | |
| Indicates that a call to `start` was compatible with an existing | |
| TensorBoard process, which can be reused according to the provided | |
| info. | |
| Attributes: | |
| info: A `TensorBoardInfo` object. | |
| """ | |
| info: TensorBoardInfo | |
| class StartLaunched: | |
| """Possible return value of the `start` function. | |
| Indicates that a call to `start` successfully launched a new | |
| TensorBoard process, which is available with the provided info. | |
| Attributes: | |
| info: A `TensorBoardInfo` object. | |
| """ | |
| info: TensorBoardInfo | |
| class StartFailed: | |
| """Possible return value of the `start` function. | |
| Indicates that a call to `start` tried to launch a new TensorBoard | |
| instance, but the subprocess exited with the given exit code and | |
| output streams. (If the contents of the output streams are no longer | |
| available---e.g., because the user has emptied /tmp/---then the | |
| corresponding values will be `None`.) | |
| Attributes: | |
| exit_code: As `Popen.returncode` (negative for signal). | |
| stdout: Error message to stdout if the stream could not be read. | |
| stderr: Error message to stderr if the stream could not be read. | |
| """ | |
| exit_code: int | |
| stdout: Optional[str] | |
| stderr: Optional[str] | |
| class StartExecFailed: | |
| """Possible return value of the `start` function. | |
| Indicates that a call to `start` failed to invoke the subprocess. | |
| Attributes: | |
| os_error: `OSError` due to `Popen` invocation. | |
| explicit_binary: If the TensorBoard executable was chosen via the | |
| `TENSORBOARD_BINARY` environment variable, then this field contains | |
| the path to that binary; otherwise `None`. | |
| """ | |
| os_error: OSError | |
| explicit_binary: Optional[str] | |
| class StartTimedOut: | |
| """Possible return value of the `start` function. | |
| Indicates that a call to `start` launched a TensorBoard process, but | |
| that process neither exited nor wrote its info file within the allowed | |
| timeout period. The process may still be running under the included | |
| PID. | |
| Attributes: | |
| pid: ID of the process running TensorBoard. | |
| """ | |
| pid: int | |
| def start(arguments, timeout=datetime.timedelta(seconds=60)): | |
| """Start a new TensorBoard instance, or reuse a compatible one. | |
| If the cache key determined by the provided arguments and the current | |
| working directory (see `cache_key`) matches the cache key of a running | |
| TensorBoard process (see `get_all`), that process will be reused. | |
| Otherwise, a new TensorBoard process will be spawned with the provided | |
| arguments, using the `tensorboard` binary from the system path. | |
| Args: | |
| arguments: List of strings to be passed as arguments to | |
| `tensorboard`. (If you have a raw command-line string, see | |
| `shlex.split`.) | |
| timeout: `datetime.timedelta` object describing how long to wait for | |
| the subprocess to initialize a TensorBoard server and write its | |
| `TensorBoardInfo` file. If the info file is not written within | |
| this time period, `start` will assume that the subprocess is stuck | |
| in a bad state, and will give up on waiting for it and return a | |
| `StartTimedOut` result. Note that in such a case the subprocess | |
| will not be killed. Default value is 60 seconds. | |
| Returns: | |
| A `StartReused`, `StartLaunched`, `StartFailed`, or `StartTimedOut` | |
| object. | |
| """ | |
| this_cache_key = cache_key( | |
| working_directory=os.getcwd(), | |
| arguments=arguments, | |
| configure_kwargs={}, | |
| ) | |
| match = _find_matching_instance(this_cache_key) | |
| if match: | |
| return StartReused(info=match) | |
| (stdout_fd, stdout_path) = tempfile.mkstemp(prefix=".tensorboard-stdout-") | |
| (stderr_fd, stderr_path) = tempfile.mkstemp(prefix=".tensorboard-stderr-") | |
| start_time_seconds = time.time() | |
| explicit_tb = os.environ.get("TENSORBOARD_BINARY", None) | |
| try: | |
| p = subprocess.Popen( | |
| ["tensorboard" if explicit_tb is None else explicit_tb] + arguments, | |
| stdout=stdout_fd, | |
| stderr=stderr_fd, | |
| ) | |
| except OSError as e: | |
| return StartExecFailed(os_error=e, explicit_binary=explicit_tb) | |
| finally: | |
| os.close(stdout_fd) | |
| os.close(stderr_fd) | |
| poll_interval_seconds = 0.5 | |
| end_time_seconds = start_time_seconds + timeout.total_seconds() | |
| while time.time() < end_time_seconds: | |
| time.sleep(poll_interval_seconds) | |
| subprocess_result = p.poll() | |
| if subprocess_result is not None: | |
| return StartFailed( | |
| exit_code=subprocess_result, | |
| stdout=_maybe_read_file(stdout_path), | |
| stderr=_maybe_read_file(stderr_path), | |
| ) | |
| info = _find_matching_instance(this_cache_key) | |
| if info: | |
| # Don't check that `info.pid == p.pid`, since on Windows that may | |
| # not be the case: see #4300. | |
| return StartLaunched(info=info) | |
| else: | |
| return StartTimedOut(pid=p.pid) | |
| def _find_matching_instance(cache_key): | |
| """Find a running TensorBoard instance compatible with the cache key. | |
| Returns: | |
| A `TensorBoardInfo` object, or `None` if none matches the cache key. | |
| """ | |
| infos = get_all() | |
| candidates = [info for info in infos if info.cache_key == cache_key] | |
| for candidate in sorted(candidates, key=lambda x: x.port): | |
| # TODO(@wchargin): Check here that the provided port is still live. | |
| return candidate | |
| return None | |
| def _maybe_read_file(filename): | |
| """Read the given file, if it exists. | |
| Args: | |
| filename: A path to a file. | |
| Returns: | |
| A string containing the file contents, or `None` if the file does | |
| not exist. | |
| """ | |
| try: | |
| with open(filename) as infile: | |
| return infile.read() | |
| except IOError as e: | |
| if e.errno == errno.ENOENT: | |
| return None | |