Welcome To Our Shell

Mister Spy & Souheyl Bypass Shell

Current Path : /proc/thread-self/root/usr/local/lib/python3.8/dist-packages/silverware/htmltext/

Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64
Upload File :
Current File : //proc/thread-self/root/usr/local/lib/python3.8/dist-packages/silverware/htmltext/BaseHtmlText.py

from bs4 import Tag
from txt import wrap
from pandas import DataFrame
from joblib import Parallel
from joblib import delayed
from slytherin.numbers import divide_into_almost_equal_parts

from .get_font_size import get_font_size
from .get_font_size import get_style
from .get_children import get_children
from .get_text import get_text


class BaseHtmlText:
	def __init__(self, obj, _id, extract_tags, ignore_function=None, depth=0, ids=None, log=None, num_jobs=1):
		"""
		Create a node from `obj`, register it under its id and parse it right away.

		:param obj: the source to read from; a BaseHtmlText hands over its tag, an object with a
			usable `html_file` hands over `html_file.soup` (or `html_file.extract_soup()` when
			`extract_tags` is truthy), anything else is used as the tag itself
		:type obj: Tag or BaseHtmlText or Pdf
		:param _id: identifier of the node; stored as `str(_id)`
		:param extract_tags: passed on to `parse`; when truthy a source BaseHtmlText also gives
			up its tag
		:param ignore_function: predicate applied to the string of childless children while
			parsing; falls back to the one of a source BaseHtmlText
		:type ignore_function: Callable or NoneType
		:param depth: production depth of the node (raised to the source's depth if that is larger)
		:param ids: dictionary from id to node shared by the whole tree; created when None
		:param log: None, False or 0 turns logging off, a list is used as the shared log, any
			other value starts a fresh log
		:param num_jobs: number of jobs available for building the children
		"""
		if isinstance(obj, BaseHtmlText):
			source = obj
			ignore_function = ignore_function or source._ignore_function
			depth = max(depth, source._production_depth)
			obj = source._tag
			if extract_tags:
				# detach the tag from the node it came from; rebinding `obj` first (as before)
				# wrote `_tag` onto the tag itself, or failed when the tag was None
				source._tag = None
		else:
			try:
				if extract_tags:
					obj = obj.html_file.extract_soup()
				else:
					obj = obj.html_file.soup
			except AttributeError:
				# best effort: without a usable `html_file`, `obj` is taken to be the tag
				pass

		self._tag = obj
		self._parent = None
		self._children = []
		self._string = None
		self._num_strings = 0
		self._font_size = get_font_size(tag=obj)
		self._style = get_style(tag=obj)
		self._complete_style = None
		self._production_depth = depth
		self._ignore_function = ignore_function
		self._index = None
		# keep the caller's dictionary even while it is still empty so the registry stays shared
		self._ids = {} if ids is None else ids
		self._num_jobs = num_jobs

		if log is None or log is False or log == 0:
			self._logs = None
		elif isinstance(log, list):
			self._logs = log
		else:
			self._logs = []

		self._id = str(_id)
		self._ids[self.id] = self
		self.parse(tag=obj, extract_tags=extract_tags)

	@classmethod
	def _get_state_attribute_names(cls):
		return {
			'_children': [],
			'_string': None,
			'_num_strings': 0,
			'_font_size': None,
			'_style': None,
			'_complete_style': None,
			'_production_depth': 0,
			'_index': None,
			'_id': 1,
			'_num_jobs': 1
		}

	@classmethod
	def _get_empty_attribute_names(cls):
		return {
			'_tag': None,
			'_parent': None,
			'_ignore_function': None,
			'_logs': [],
			'_ids': {}
		}

	def __repr__(self):
		return f'<HtmlText: {str(self)}>'

	def __str__(self):
		result = self.string
		for child in self.children:
			result += ' ' + str(child)

		if len(result) > 64:
			return f'{result[:60]} ...'
		else:
			return result

	def __getstate__(self):
		return {key: getattr(self, key) for key in self._get_state_attribute_names().keys()}

	def __setstate__(self, state):
		dictionary = {**self._get_empty_attribute_names(), **self._get_empty_attribute_names(), **state}

		for key, value in dictionary.items():
			try:
				setattr(self, key, value)
			except AttributeError:
				print(f"can't set attribute \"{key}\" to \"{value}\"")
				raise AttributeError(f"can't set attribute \"{key}\" to \"{value}\"")

		for child in self.children:
			child._parent = self

	def _set_logs(self, logs):
		self._logs = logs
		for child in self.children:
			child._set_logs(logs=logs)

	def _set_ids(self, ids):
		self._ids = ids
		for child in self.children:
			child._set_ids(ids=ids)

	def delete(self, by):
		if self.id is not None:
			self.log('delete', by=by)
			self._ids[self.id] = None
		if self.parent:
			try:
				self.parent.children.remove(self)
			except ValueError:
				pass

	@property
	def id(self):
		"""
		Identifier of the node as stored in `_id`.
		`__init__` stores it as `str(_id)`; children built by this class get ids of the form
		`<parent id>.<child number>`.

		:rtype: str or NoneType
		"""
		return self._id

	@property
	def logs(self):
		"""
		:rtype: DataFrame or NoneType
		"""
		if self._logs is None:
			return None
		else:
			return DataFrame.from_records(self._logs)

	def log(self, message, by):
		if self._logs is not None and self.id is not None:
			self._logs.append({'id': self.id, 'message': message, 'by': by.id})

	@property
	def depth(self):
		"""
		:rtype: int
		"""
		if self.parent is not None:
			return self.parent.depth + 1
		else:
			return 0

	@property
	def string(self):
		"""
		:rtype: str
		"""
		if self._string is None:
			return ''
		else:
			return self._string.strip()

	@string.setter
	def string(self, string):
		"""
		:type string: str
		"""
		if not isinstance(string, str):
			raise TypeError(f'{string} is of type {type(string)}')
		if self._string is not None:
			raise ValueError(f'cannot overwrite string "{self._string}" with "{string}"')
		self.log(message=f'add string "{string}"', by=self)
		self._string = string
		self._num_strings = 1

	@property
	def string_for_display(self):
		"""
		:rtype: str
		"""
		if self.string == '':
			return ''
		else:
			string = self.string
			if len(string) > 0 and self.font_size is not None:
				return f'{self.index_string} {string} ({self.font_size})'
			else:
				return string

	def ignore_string(self, string):
		if self._ignore_function is None:
			return False
		else:
			return self._ignore_function(string)

	@property
	def tag(self):
		"""
		The object the node was parsed from, as stored in `_tag`.
		It is None after unpickling because `_tag` is not part of the pickled state.

		:rtype: Tag
		"""
		return self._tag

	@property
	def font_size(self):
		"""
		:rtype: int or float or NoneType
		"""
		if self.parent is None:
			return self._font_size
		else:
			return self._font_size or self.parent.font_size

	@property
	def style(self):
		"""
		:rtype: dict[str, str]
		"""
		if self._complete_style is None:
			if self.parent is None or self.parent.style is None:
				self._complete_style = self._style
			elif self._style is None:
				self._complete_style = self.parent.style
			else:
				self._complete_style = {**self.parent.style, **self._style}
		return self._complete_style

	@property
	def children_font_size_max(self):
		if self.num_children == 0:
			return None
		else:
			font_sizes = [
				child.font_size if child.font_size is not None else child.children_font_size_max
				for child in self.children
			]
			font_sizes = [x for x in font_sizes if x is not None]
			if len(font_sizes) > 0:
				return max(font_sizes)
			else:
				return None

	@property
	def children(self):
		"""
		The list of child nodes as stored in `_children`. The list object itself is returned,
		so changes made to it by the caller change the node.

		:rtype: list[BaseHtmlText] or None
		"""
		return self._children

	@property
	def parent(self):
		"""
		:rtype: BaseHtmlText or NoneType
		"""
		return self._parent

	@parent.setter
	def parent(self, parent):
		"""
		:type parent: BaseHtmlText
		"""
		if not isinstance(parent, BaseHtmlText):
			raise TypeError(f'parent {parent.id} is of type {type(parent)}')
		parent.log(f'become parent of {self.id}', by=self)
		self._parent = parent
		self._index = parent.num_children
		parent.children.append(self)

	@property
	def index(self):
		if self.parent is None:
			return 0
		elif self._index is None:
			return -1
		else:
			return self._index

	@property
	def index_string(self):
		if self.parent is not None:
			max_width = len(str(self.parent.num_children - 1))
			return str(self.index).rjust(max_width, '0')
		else:
			return ''

	@property
	def num_children(self):
		if self._children is None:
			return 0
		else:
			return len(self._children)

	@property
	def num_siblings(self):
		if self.parent is None:
			return 0
		else:
			return self.parent.num_children - 1

	@property
	def rank_among_siblings(self):
		if self.parent is None:
			return 1
		else:
			try:
				return self.parent.children.index(self) + 1
			except ValueError:
				return -1

	@property
	def last_child(self):
		"""
		:rtype: BaseHtmlText or NoneType
		"""
		if self.num_children == 0:
			return None
		else:
			return self.children[-1]

	def is_just_text(self):
		return self._string is not None and self.num_children == 0

	def is_only_parent(self):
		return self._string is None and self.num_children > 0

	def parse(self, tag, extract_tags):
		"""
		Fill the node from `tag`: a tag for which `get_children` reports children is handed to
		`parse_children`; otherwise the text of the tag becomes the node's string after
		leading '-', '*', tab, space and '.' characters are removed.

		:param tag: the object to read from
		:param extract_tags: forwarded to `get_children` and `parse_children`
		"""
		tag_children = get_children(tag, ignore_single_children=False, extract=extract_tags)
		has_children = tag_children is not None and len(tag_children) != 0

		if has_children:
			self.parse_children(tag_children=tag_children, extract_tags=extract_tags)
		else:
			text = get_text(tag) or ''
			self.string = text.lstrip('-*\t .')

	def absorb_children(self, other, by):
		"""
		:type other: BaseHtmlText
		:type by: BaseHtmlText
		"""
		self.log(message=f'absorb children of {other}', by=by)
		for child in other.children:
			child.parent = self
		other._children = []
		other.delete(by=by)

	def join_text_children(self):
		self.log(message='join text children', by=self)
		children = self.children
		self._children = []

		for child in children:
			prev_child = self.last_child
			if prev_child is None:
				child.parent = self

			# absorb
			elif prev_child.font_size == child.font_size or prev_child.font_size is None or child.font_size is None:
				prev_child._string += '\n' + child._string
				prev_child._num_strings = prev_child._num_strings + child._num_strings
				child._num_strings = 0
				child.delete(by=self)
				prev_child._font_size = prev_child._font_size or child._font_size

	def absorb_font_and_style(self, html_text):
		"""
		:type html_text: BaseHtmlText
		"""
		self._font_size = html_text._font_size or self._font_size
		if self._style is None:
			self._style = html_text._style
		elif html_text._style is None:
			pass
		else:
			self._style = {**html_text._style, **self._style}

	def absorb_grand_children(self):
		if len(self.children) != 1:
			raise RuntimeError('cannot absorb grand children when there are more than one child')

		single_child = self.children[0]

		if single_child.is_just_text() and self.string == '':
			self.log(message=f'absorb text from child {single_child.id}', by=self)
			self._children = []
			self.string = single_child._string
			self._num_strings = single_child._num_strings
			single_child._string = None
			single_child._num_strings = 0
			self.absorb_font_and_style(html_text=single_child)
			single_child.delete(by=self)

		elif single_child.is_only_parent():
			self.log(message=f'absorb grand children from child {single_child.id}', by=self)
			self._children = []
			for grand_child in single_child.children:
				grand_child.parent = self
			self.absorb_font_and_style(html_text=single_child)
			single_child.delete(by=self)

		# if single child is both parent and string
		elif self.string == '':
			self.log(message=f'absorb text from child {single_child.id}', by=self)
			self.string = single_child.string
			self._num_strings = single_child._num_strings
			single_child._string = None
			single_child._num_strings = 0
			for grand_child in single_child.children:
				grand_child.parent = self
			self.absorb_font_and_style(html_text=single_child)
			single_child.delete(by=self)
		else:
			self.log(message=f'cannot absorb anything from child {single_child.id}', by=self)

	def _get_tag_and_child(self, tag, num_jobs, child_id, extract_tags):
		"""
		Build the node for one child tag. The new node gets the id `<own id>.<child_id>`, a
		production depth one larger than this node's and shares this node's log and id registry.
		It is not attached to this node here.

		:param tag: the child tag to parse
		:param num_jobs: number of jobs the new node may use for its own children
		:param child_id: number of the child, used as last part of its id
		:param extract_tags: forwarded to the new node
		:return: the tag together with the node built from it
		:rtype: tuple[Tag, BaseHtmlText]
		"""
		child_node = BaseHtmlText(
			obj=tag,
			_id=f'{self.id}.{child_id}',
			extract_tags=extract_tags,
			depth=self._production_depth + 1,
			ids=self._ids,
			log=self._logs,
			num_jobs=num_jobs
		)
		return tag, child_node

	def get_tags_and_children(self, tag_children, extract_tags):
		"""
		Build one node per child tag, numbering the children from 1.

		The jobs left after reserving one for this node are divided among the children. The
		nodes are built one after the other when this node has fewer than two jobs or when any
		child would receive less than one job; otherwise they are built through joblib's
		threading backend with shared memory, each child getting its share of the jobs.

		:param tag_children: the child tags
		:param extract_tags: forwarded to every new node
		:return: pairs of tag and node, in the order of `tag_children`
		:rtype: list[tuple[Tag, BaseHtmlText]]
		"""
		spare_jobs = self._num_jobs - 1
		num_jobs_per_child = divide_into_almost_equal_parts(number=spare_jobs, by=len(tag_children))

		build_serially = self._num_jobs < 2 or any(share < 1 for share in num_jobs_per_child)
		if build_serially:
			return [
				self._get_tag_and_child(tag=_tag, num_jobs=1, child_id=_child_id, extract_tags=extract_tags)
				for _child_id, _tag in enumerate(tag_children, start=1)
			]

		numbered_tags_and_shares = list(zip(enumerate(tag_children, start=1), num_jobs_per_child))
		processor = Parallel(n_jobs=self._num_jobs, backend='threading', require='sharedmem')
		return processor(
			delayed(self._get_tag_and_child)(
				tag=_tag,
				num_jobs=int(_num_jobs),
				child_id=_child_id,
				extract_tags=extract_tags
			)
			for (_child_id, _tag), _num_jobs in numbered_tags_and_shares
		)

	def parse_children(self, tag_children, extract_tags):
		"""
		Build the nodes of the child tags and arrange them under this node.

		Children without children and without text, or whose text the ignore function
		rejects, are deleted. Each remaining child is compared with the child attached last:
		* compatible font sizes (equal, or one of them unknown): two string-less parents are
		  merged, anything else becomes a child of this node
		* the previous child has the larger font size: the child goes under the previous child
		* the previous child has the smaller font size: the child becomes a child of this node
		Afterwards text-only children are joined and a single remaining child is absorbed.

		:param tag_children: the child tags
		:param extract_tags: forwarded to the construction of the children
		"""
		pairs = self.get_tags_and_children(tag_children=tag_children, extract_tags=extract_tags)

		for _tag, child in pairs:
			previous = self.last_child

			# ignore useless children
			if child.num_children == 0 and child.string == '':
				child.log("no children and no string", by=self)
				child.delete(by=self)
				continue

			if child.num_children == 0 and self.ignore_string(string=child.string):
				child.log("no children and ignore string", by=self)
				child.delete(by=self)
				continue

			if previous is None:
				child.parent = self
				continue

			previous_size = previous.font_size
			child_size = child.font_size
			compatible = previous_size == child_size or previous_size is None or child_size is None

			if compatible:
				# absorb if the two children are parents in the same level
				if previous.is_only_parent() and child.is_only_parent():
					previous.absorb_children(other=child, by=self)
				else:
					child.parent = self
			elif previous_size > child_size:
				child.parent = previous
			elif previous_size < child_size:
				child.parent = self

		# if all children are just text join them together
		only_text = all([child.is_just_text() for child in self.children])
		if only_text:
			self.join_text_children()

		# absorb children of the one child
		if self.num_children == 1:
			self.absorb_grand_children()

	def display_string(self, max_width, depth_decrease=0):

		if len(self.string) > 0:
			indentation = '  ' * (self.depth - depth_decrease)
			return wrap(
				text=f'< {self.string_for_display} >',
				max_width=max_width,
				indentation=indentation,
				prefix='|',
				suffix='|'
			)

		else:
			result = ''

		return result

	def display(self, max_width=90, depth_decrease=None):
		"""
		displays the HtmlText and all its children
		:rtype: str
		"""
		if depth_decrease is None:
			depth_decrease = self.depth

		result = self.display_string(max_width=max_width, depth_decrease=depth_decrease)

		for child in self.children:
			child_string = child.display(max_width=max_width, depth_decrease=depth_decrease)
			if len(child_string) > 0 and len(result) > 0:
				result += '\n' + child_string
			else:
				result += child_string

		return result

	def print(self, max_width=90):
		return print(self.display(max_width=max_width))

	def __getitem__(self, item):
		if isinstance(item, int):
			return self.children[item]
		elif isinstance(item, str):
			for child in self.children:
				if child._string.strip().lower().startswith(item.lower()):
					return child
			raise KeyError(f'{item} does not exist!')

	@property
	def record(self):
		"""
		:rtype: dict[str, ]
		"""
		return {
			'id': self.id,
			'parent_id': self.parent.id if self.parent is not None else 0,
			'depth': self.depth,
			'index': self.index,
			'font_size': self.font_size,
			'children_font_size_max': self.children_font_size_max,
			'rank_among_siblings': self.rank_among_siblings,
			'num_siblings': self.num_siblings,
			'num_children': self.num_children,
			'num_characters': len(self.string),
			'num_strings': self._num_strings,
			'text': self.string,
			**self.style
		}

	@property
	def records(self):
		"""

		:rtype: list[dict[str, ]]
		"""
		result = [self.record]
		for child in self.children:
			result += child.records
		return result

	@property
	def simple_records(self):
		if self.string != '':
			result = [{'font_size': self.font_size, 'text': self.string}]
		else:
			result = []

		for child in self.children:
			result += child.simple_records

		return result

bypass 1.0, Devloped By El Moujahidin (the source has been moved and devloped)
Email: contact@elmoujehidin.net bypass 1.0, Devloped By El Moujahidin (the source has been moved and devloped) Email: contact@elmoujehidin.net