h2ogpt
I have many documents on my website; how do I add all of them to the database?
I have many documents on my website, and I want to add all of them to the database and, in response to a chat query, return the URL of the webpage where each document is located. How can I implement this?
This sounds like an amazing feature
Maybe it can be implemented in a different way, for example by having an optional metadata file that maps each file to a URL or location, then downloading all the HTML files with some Linux utility and generating that metadata file. That way it would be useful not only for websites but also for other documentation software like notes, Notion, OneNote...
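A minimal sketch of that metadata idea, assuming the site can be mirrored with wget; the doc_metadata.json file name and directory layout are assumptions for illustration, not an existing h2ogpt feature:

    import json
    import os
    import subprocess

    # Mirror the site's HTML locally (assumes wget is installed).
    site = "https://example.com/docs/"
    subprocess.run(["wget", "--mirror", "--no-parent", "--directory-prefix=site", site], check=True)

    # Build the optional metadata file mapping each local file to its source URL.
    mapping = {}
    for root, _, files in os.walk("site"):
        for name in files:
            path = os.path.join(root, name)
            # wget mirrors into site/<host>/<path>, so reconstruct the URL from that layout.
            rel = os.path.relpath(path, "site")
            host, _, url_path = rel.partition(os.sep)
            mapping[path] = "https://%s/%s" % (host, url_path.replace(os.sep, "/"))

    with open("doc_metadata.json", "w") as f:
        json.dump(mapping, f, indent=2)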
Yes, it's a good idea. Right now you can provide a list of URLs as sources, but having it scrape all the docs on a page makes more sense most of the time. I've done that before for another project using bs4.
One example, where the goal is to zip up logs:
import os
import requests

def download_path(url_base, dest_path=None, log_pattern=False):
    import bs4
    # fetch the index page and collect every anchor's href
    r = requests.get(url_base)
    data = bs4.BeautifulSoup(r.text, "html.parser")
    urls = [x.get('href') for x in data.find_all("a")]
    urls = [x for x in urls if x is not None]
    if log_pattern:
        # keep only non-anonymized .log files
        urls = [x for x in urls if '.log' in x and 'anonymized' not in x]
        urls = [x for x in urls if x.endswith('.log')]
    for url in urls:
        zip_dest_expected = os.path.join(dest_path, '%s.zip' % url)
        if not os.path.isfile(zip_dest_expected):
            dest_expected = os.path.join(dest_path, url)
            if not os.path.isfile(dest_expected):
                # download() is a helper in that project that fetches a URL into dest_path
                dest = download(url_base + "/" + url, dest_path=dest_path)
            else:
                dest = dest_expected
            os.system("zip %s.zip %s" % (dest, dest))
            remove_simple(dest)  # project helper: delete the uncompressed original
Run like:
download_path("http://mr-0xg1:8080/job/dai-native-pipeline-nightly/job/dev/%s/artifact/test_benchmark_openml%d-x86_64-11.2.2/tmp/h2oaiopenml/" % (job, suite), dest_path=dest_path, log_pattern=True)
which scans through that page and gets all log files and zips them up.
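For the original question, the same bs4 pattern can collect a site's documents and record where each one came from, so the source URL can be returned alongside chat answers. A rough sketch; the extension filter and the url_map.json mapping file are assumptions for illustration, not an existing h2ogpt API:

    import json
    import os
    from urllib.parse import urljoin
    import bs4
    import requests

    def scrape_docs(url_base, dest_path, exts=('.pdf', '.html')):
        # collect document links from the index page, as in download_path above
        r = requests.get(url_base)
        soup = bs4.BeautifulSoup(r.text, "html.parser")
        hrefs = [a.get('href') for a in soup.find_all("a")]
        hrefs = [h for h in hrefs if h and h.endswith(exts)]
        mapping = {}
        os.makedirs(dest_path, exist_ok=True)
        for href in hrefs:
            full_url = urljoin(url_base, href)  # resolve relative links
            dest = os.path.join(dest_path, os.path.basename(href))
            with open(dest, "wb") as f:
                f.write(requests.get(full_url).content)
            mapping[dest] = full_url  # remember which page each document came from
        with open(os.path.join(dest_path, "url_map.json"), "w") as f:
            json.dump(mapping, f, indent=2)
        return mapping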
Another fragment in use is this one, where for Driverless AI we pull custom recipes from GitHub, which is a tree of locations:
def get_urls_or_files(self, url):
    # convert the various formats of what is passed into a list of urls or files
    if ' ' in url:
        check_urls = url.split(' ')
    else:
        check_urls = [url]
    for i in check_urls:
        if not ContribLoader.valid_recipe_source(i):
            err = f'Source for custom recipe does not match locked repository {config.custom_recipes_git_repo}'
            if config.custom_recipes_git_branch != "None":
                err += f' and branch {config.custom_recipes_git_branch}'
            raise ValueError(err)
    if url.startswith("file://") and os.path.isfile(url[7:]):
        return [], [url[7:]]
    elif (url.startswith("http://github.com") or url.startswith(
            "https://github.com")) and '.py' not in url and ' ' not in url:
        struuid = str(uuid.uuid4())
        saved_path = "githubrepo%s" % struuid
        saved_path = os.path.join(self._root_dir, saved_path)
        if '/tree/' in url:
            url_repo = url.split('/tree/')[0]
            branch_hash = "/tree".join(url.split('/tree/')[1:]).split('/')[0]
            rel_path = "/".join("/tree".join(url.split('/tree/')[1:]).split('/')[1:])
        else:
            url_repo = url
            branch_hash = None
            rel_path = "/"
        from git.repo.base import Repo  # local import to avoid "Bad git executable" during python scoring on non-conda setup
        repo = Repo.clone_from(url_repo, saved_path)
        if branch_hash is not None:
            repo.git.checkout(branch_hash)
        files_to_return = []
        for (path, dirs, files) in os.walk(saved_path):
            if os.path.join(saved_path, rel_path) not in path:
                continue
            for file in files:
                filename = os.path.join(path, file)
                for name in self.names:
                    # models may have transformers in them etc.
                    if filename.endswith('.py') and \
                            "/" + name + "/" in filename and \
                            name + "_template" not in filename and \
                            name[0:-1] + "_template" not in filename and \
                            "how_to" not in filename and \
                            os.path.basename(filename) not in config.custom_recipes_excluded_filenames_from_repo_download:
                        files_to_return.append(filename)
        return [], files_to_return
    elif all('.py' in x for x in url.split(' ')):
        # assume string of "file1.py file2.py etc."
        return url.split(' '), []
    else:
        hrefs = self.get_hrefs(url)
        hrefs_py = [x for x in hrefs if x.endswith(".py")]
        links_py = [x if url in x else url + x for x in hrefs_py]
        # this is the last resort if all other types fail, so raise an error if nothing is found
        if len(links_py) == 0:
            raise ValueError("No python code (links ending in .py) found in url %s " % sanitize_github_url(url))
        return links_py, []
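To make the /tree/ parsing above concrete, here is what it yields for a typical recipes URL (results shown as comments):

    url = "https://github.com/h2oai/driverlessai-recipes/tree/master/transformers"
    url_repo = url.split('/tree/')[0]  # https://github.com/h2oai/driverlessai-recipes
    branch_hash = "/tree".join(url.split('/tree/')[1:]).split('/')[0]  # master
    rel_path = "/".join("/tree".join(url.split('/tree/')[1:]).split('/')[1:])  # transformers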
def get_hrefs(self, url):
    import httplib2
    http = httplib2.Http()
    # httplib2 returns (response headers, content)
    response, content = http.request(url)
    from bs4 import BeautifulSoup, SoupStrainer
    links = []
    if response.status == 200:
        # parse only anchor tags for speed
        for link in BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a')):
            if link.has_attr('href'):
                links.append(link['href'])
    return links
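If writing this fresh, a requests-based version would avoid the extra httplib2 dependency and match the rest of the code; a sketch under that assumption, not what ships in Driverless AI:

    import requests
    from bs4 import BeautifulSoup, SoupStrainer

    def get_hrefs_requests(url):
        r = requests.get(url)
        links = []
        if r.status_code == requests.codes.ok:
            for link in BeautifulSoup(r.text, "html.parser", parse_only=SoupStrainer('a')):
                if link.has_attr('href'):
                    links.append(link['href'])
        return links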
def get_url_data(self, url):
    logger_url = sanitize_github_url(url)
    msg = []
    url_data = None
    url = url.strip()
    # convenience mapping from HTML pages in github to raw source
    if "/github.com/" in url and "/blob/" in url:
        url = url.replace("/github.com/", "/raw.githubusercontent.com/")
        url = url.replace("/blob/", "/")
        loggerwarning(self.logger,
                      "Automatically converting from HTML github page to raw source code URI: %s" % logger_url)
    if "h2oai/driverlessai-recipes/master" in url:
        msg.append(master_recipe_used)
    trials = config.get_url_data_trials
    for trial in range(trials):
        try:
            request_kwargs = networking.get_common_request_kwargs(config)
            url_data = requests.get(url, **request_kwargs)
            break
        except requests.exceptions.ConnectionError as e:
            msg1 = "Problem with connecting to %s Error: %s" % (str(logger_url), str(e))
            loggerwarning(self.logger, msg1)
            if trial == trials - 1:
                raise
            else:
                time.sleep(config.get_url_data_sleep)
    if url_data is None or url_data.status_code != requests.codes.ok:
        if url_data is not None:
            msg1 = "Cannot get url %s, code: %s, reason: %s" % (
                str(logger_url), str(url_data.status_code), str(url_data.reason))
            loggerwarning(self.logger, msg1)
            msg.append(msg1)
        if True or config.hard_asserts:
            raise requests.exceptions.RequestException(msg1)
        else:
            return url_data, msg
    return url_data, msg
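For example, the blob-to-raw conversion at the top of get_url_data rewrites a GitHub HTML page URL into its raw counterpart (illustrative path):

    url = "https://github.com/h2oai/driverlessai-recipes/blob/master/transformers/my_transformer.py"
    url = url.replace("/github.com/", "/raw.githubusercontent.com/").replace("/blob/", "/")
    # -> https://raw.githubusercontent.com/h2oai/driverlessai-recipes/master/transformers/my_transformer.py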
def _load_custom_blueprints_url(self, url: str, load_global_packages_only=False):
    """Load custom contrib code from a (remote) location, such as a GIT repository."""
    logger_url = sanitize_github_url(url)
    msg = []
    try:
        if len(url) > 0:
            content = None
            url_data = None
            if content is None:
                url_data, msg_list1 = self.get_url_data(url)
                msg.extend(msg_list1)
            # dump url content to temporary file
            file = None
            try:
                file_name = url.rsplit("/", 1)[1]
                if '.py' != file_name[-3:] and '.py' in file_name:
                    # force .py ending in case token or other stuff there
                    file_name += ".py"
                content = url_data.text
                path = self._root_dir.joinpath("tmp", file_name)
                if not os.path.isfile(path):
                    self.atomic_copy(dst=path, content=content)
                file = os.path.relpath(path)
                base_name_file = self.sanitize_identifier(os.path.basename(file).split('.py')[0])
                msg_list = self._load_custom_blueprints_file(path=file, base_name=base_name_file,
                                                             load_global_packages_only=load_global_packages_only)
                msg.extend(msg_list)
            except FileNotFoundError as e:
                t, v, tb = sys.exc_info()
                ex = ''.join(traceback.format_exception(t, v, tb))
                if config.hard_asserts:
                    ee = ex
                else:
                    ee = e
                loggerwarning(self.logger,
                              'Loading custom %s from %s failed: %s. Potential race, so not failing, presuming another '
                              'process did a reasonable thing to the file (i.e. deleted it). During _load_custom_blueprints_url. global=%s' % (
                                  self.name, file, str(ee), load_global_packages_only))
            finally:
                remove(file)
        else:
            msg1 = 'Loading custom %s from GIT recipe failed. Invalid recipe URL=%s' % (self.name, logger_url)
            loggerwarning(self.logger, msg1)
            if config.hard_asserts:
                raise RuntimeError(msg1)
            msg.append(msg1)
    except requests.exceptions.RequestException as e:
        msg1 = 'Loading custom %s from url %s failed: %s' % (self.name, logger_url, str(e))
        loggerwarning(self.logger, msg1)
        msg.append(msg1)
        raise
    except Exception as e:
        msg1 = 'Loading custom %s from url %s failed: %s' % (self.name, logger_url, str(e))
        loggerwarning(self.logger, msg1)
        if config.hard_asserts:
            raise
        msg.append(msg1)
    return msg
For some url, the above is called like:
if url:
    urls, files = self.get_urls_or_files(url)
    for url in urls:
        msg_list1 = self._load_custom_blueprints_url(url, load_global_packages_only=load_global_packages_only)
        msg.extend(msg_list1)