Welcome To Our Shell

Mister Spy & Souheyl Bypass Shell

Current Path : /proc/thread-self/root/usr/local/lib/python3.8/dist-packages/tests/

Linux ift1.ift-informatik.de 5.4.0-216-generic #236-Ubuntu SMP Fri Apr 11 19:53:21 UTC 2025 x86_64

Current File : //proc/thread-self/root/usr/local/lib/python3.8/dist-packages/tests/test_basic.py

#!/usr/bin/env python
import os
from subprocess import PIPE, Popen
from typing import Optional

from bs4 import BeautifulSoup
from bs4.element import Tag
from shapely.geometry import box

import pdftotree


# Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check
def get_prop(node: Tag, name: str) -> Optional[str]:
    title = node.get("title")
    if not title:
        return None
    props = title.split(";")
    for prop in props:
        (key, args) = prop.split(None, 1)
        if key == name:
            return args
    return None


# Adapted from https://github.com/ocropus/hocr-tools/blob/v1.3.0/hocr-check
def get_bbox(node: Tag) -> box:
    bbox = get_prop(node, "bbox")
    if not bbox:
        return None
    return box(*[int(x) for x in bbox.split()])


def test_heuristic_completion():
    """Simply test that parse runs to completion without errors."""
    output = pdftotree.parse("tests/input/paleo.pdf")
    assert output is not None


def test_cli_should_output_at_given_path(tmp_path):
    """Test if CLI produces an HTML at a given path."""
    html_path = os.path.join(tmp_path, "paleo.html")
    pdftotree.parse("tests/input/paleo.pdf", html_path)
    assert os.path.isfile(html_path)


def test_output_should_conform_to_hocr(tmp_path):
    """Test if an exported file conform to hOCR."""
    html_path = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", html_path)
    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        assert all([line.decode("utf-8").startswith("ok") for line in proc.stderr])


def test_visualize_output(tmp_path):
    """Test if an output can be visualzied."""
    html_path = os.path.join(tmp_path, "md.html")
    pdftotree.parse("tests/input/md.pdf", html_path, visualize=True)


def test_looks_scanned():
    """Test on a PDF that looks like a scanned one but not.

    CaseStudy_ACS.pdf contains a transparent image overlaying the entire page.
    This overlaying transparent image fools TreeExtractor into thinking it is scanned.
    """
    output = pdftotree.parse("tests/input/CaseStudy_ACS.pdf")
    soup = BeautifulSoup(output, "lxml")
    assert len(soup.find_all(class_="ocrx_word")) >= 1000
    assert len(soup.find_all("figure")) == 3

    # Check if words are extracted even though they are overlapped by a figure (#77).
    page = soup.find(class_="ocr_page")  # checking only 1st page is good enough.
    words = [get_bbox(word) for word in page.find_all(class_="ocrx_word")]
    figure = get_bbox(page.find("figure"))
    assert all([figure.contains(word) for word in words])


def test_LTChar_under_LTFigure(tmp_path):
    """Test on a PDF where LTChar(s) are children of LTFigure."""
    html_path = os.path.join(tmp_path, "paleo.html")
    pdftotree.parse("tests/input/CentralSemiconductorCorp_2N4013.pdf", html_path)
    with open(html_path) as f:
        soup = BeautifulSoup(f, "lxml")
    line: Tag = soup.find(class_="ocrx_line")
    assert [word.text for word in line.find_all(class_="ocrx_word")] == [
        "Small",
        "Signal",
        "Transistors",
    ]

    # The table in the 1st page should contain 18 columns
    page = soup.find(class_="ocr_page")
    table = page.find(class_="ocr_table")
    assert len(table.find("tr").find_all("td")) == 18
    assert get_bbox(table) is not None

    # Find a cell containing one or more of ocrx_word and check if it has bbox
    cell = table.find(class_="ocrx_word").parent.parent
    assert get_bbox(cell) is not None

    with Popen(["hocr-check", html_path], stderr=PIPE) as proc:
        assert all([line.decode("utf-8").startswith("ok") for line in proc.stderr])


def test_ml_completion():
    """Simply test that ML-based parse runs without errors."""
    output = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="ml",
        model_path="tests/input/paleo_model.pkl",
    )
    assert output is not None


def test_vision_completion():
    """Simply test that vision-based parse runs without errors."""
    output = pdftotree.parse(
        "tests/input/paleo.pdf",
        model_type="vision",
        model_path="tests/input/paleo_visual_model.h5",
    )
    soup = BeautifulSoup(output, "lxml")
    assert len(soup.find_all("table")) == 2

bypass 1.0, Devloped By El Moujahidin (the source has been moved and devloped)
Email: contact@elmoujehidin.net bypass 1.0, Devloped By El Moujahidin (the source has been moved and devloped) Email: contact@elmoujehidin.net