
# Current Path : /proc/thread-self/root/usr/local/lib/python3.8/dist-packages/silverware/
# Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64
# Current File : //proc/thread-self/root/usr/local/lib/python3.8/dist-packages/silverware/clean_html_text.py
from .clone_beautiful_soup_tag import clone_beautiful_soup_tag
def replace_images_with_text(element):
    """Return a clone of *element* whose ``<img>`` tags carry their alt text.

    The input is copied with ``clone_beautiful_soup_tag`` so the caller's
    element is not the object being modified here. Every ``img`` found in
    the copy that has an ``alt`` attribute gets that value assigned to its
    ``string_for_display`` attribute; images without ``alt`` are skipped.

    NOTE(review): how ``string_for_display`` is consumed (e.g. by
    ``get_text()``) depends on the object ``clone_beautiful_soup_tag``
    returns -- confirm against that helper.

    Args:
        element: Object accepted by ``clone_beautiful_soup_tag``
            (presumably a BeautifulSoup tag -- confirm against callers).

    Returns:
        The modified clone.
    """
    clone = clone_beautiful_soup_tag(element)
    images_with_alt = (
        image for image in clone.find_all('img') if 'alt' in image.attrs
    )
    for image in images_with_alt:
        image.string_for_display = image.attrs['alt']
    return clone
def clean_html_text(html, replace_images=False):
    """Return the text of *html* collapsed onto a single line.

    Args:
        html: A ``str``, ``None``, or an object exposing ``get_text()``
            (presumably a BeautifulSoup tag -- confirm against callers).
        replace_images: When true, a non-string element is first passed
            through ``replace_images_with_text`` before its text is read.

    Returns:
        ``html`` itself, unchanged, when it is a ``str``; ``None`` when it
        is ``None``; otherwise the result of ``get_text()`` with every run
        of whitespace reduced to one space and no leading or trailing
        whitespace.
    """
    # Strings are handed back exactly as received (they are NOT
    # normalised), and None is passed straight through.
    if html is None or isinstance(html, str):
        return html

    if replace_images:
        html = replace_images_with_text(element=html)

    # str.split() without a separator splits on any whitespace run --
    # newlines and non-breaking spaces (U+00A0) included -- and ignores
    # leading/trailing whitespace, so no prior replace()/strip() is needed.
    return ' '.join(html.get_text().split())