import fitz  # PyMuPDF
import random
import re


def extract_sentences_with_coords(page):
    """Split the text of *page* into sentences and locate each one on the page.

    The page is read with ``page.get_text("dict")``. For every text block
    (``block["type"] == 0``) the text of all its spans is concatenated, in
    order and without any separator, and the result is cut into sentences
    with the pattern ``[^.!?]*[.!?]``. Only the ASCII terminators ``.``,
    ``!`` and ``?`` end a sentence; trailing text that has no terminator is
    not returned.

    Args:
        page: A PyMuPDF page object.

    Returns:
        A list of ``(sentence_text, rect)`` tuples. ``sentence_text`` is the
        sentence with surrounding whitespace stripped. ``rect`` is a
        ``fitz.Rect`` that is the union of the bounding boxes of every span
        containing at least one character of the stripped sentence.
    """
    sentence_pattern = re.compile(r'[^.!?]*[.!?]')
    sentences = []

    for block in page.get_text("dict")["blocks"]:
        if block["type"] != 0:
            continue

        # Concatenate the span texts and remember, for each span, the
        # half-open character range [start, end) it occupies in the result.
        pieces = []
        spans_info = []
        offset = 0
        for line in block["lines"]:
            for span in line["spans"]:
                text = span["text"]
                if not text:
                    # An empty span owns no characters, so it can never be
                    # part of a sentence.
                    continue
                next_offset = offset + len(text)
                pieces.append(text)
                spans_info.append((offset, next_offset, fitz.Rect(span["bbox"])))
                offset = next_offset
        merged_text = "".join(pieces)

        for match in sentence_pattern.finditer(merged_text):
            raw = match.group()
            # Every match ends with a terminator, so stripping can only
            # remove leading whitespace and never yields an empty string.
            sentence_text = raw.strip()

            # Measure the sentence from its first visible character. The
            # whitespace left over from the previous sentence may sit in an
            # earlier span that must not contribute to the rectangle.
            s_start = match.start() + (len(raw) - len(raw.lstrip()))
            s_end = match.end()  # exclusive

            sentence_rect = None
            for start, end, bbox in spans_info:
                if end <= s_start:
                    # Span lies entirely before the sentence.
                    continue
                if start >= s_end:
                    # Span starts at or after the exclusive end; spans are
                    # in text order, so no later span can overlap either.
                    break
                if sentence_rect is None:
                    # Copy so the stored span rectangle is never aliased.
                    sentence_rect = fitz.Rect(bbox)
                else:
                    sentence_rect = sentence_rect | bbox

            if sentence_rect:
                sentences.append((sentence_text, sentence_rect))

    return sentences


def random_highlight_sentences(pdf_path, output_path, highlight_percentage=0.1, pages_to_highlight=None):
    """Highlight a random subset of sentences on selected pages of a PDF.

    For each selected page that yields at least one sentence, a random sample
    of its sentences is highlighted, each in a colour picked at random from a
    fixed palette. Independently, each processed page has a 50% chance of
    also receiving a red mark via ``add_red_marks``. The modified document is
    written to ``output_path``.

    Args:
        pdf_path: Path of the PDF to read.
        output_path: Path the modified PDF is saved to.
        highlight_percentage: Fraction of a page's sentences to highlight.
            At least one sentence is always highlighted on a page that has
            any, and never more than the page contains.
        pages_to_highlight: Iterable of 0-based page numbers to process, or
            ``None`` to process every page. Numbers outside the document are
            ignored.
    """
    # Candidate highlight colours as RGB triples (cyan, orange, green);
    # adjust as needed.
    colors = [
        (0.0, 1.0, 1.0),
        (1.0, 0.5, 0.0),
        (0.0, 1.0, 0.0),
    ]

    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)

        # A set gives constant-time membership tests in the page loop.
        if pages_to_highlight is None:
            selected_pages = set(range(page_count))
        else:
            selected_pages = set(pages_to_highlight)

        for page_num in range(page_count):
            if page_num not in selected_pages:
                continue

            page = doc[page_num]
            sentences = extract_sentences_with_coords(page)
            if not sentences:
                continue

            # Highlight at least one sentence, but never ask random.sample
            # for more items than exist (it raises ValueError otherwise,
            # e.g. when highlight_percentage > 1).
            wanted = max(1, int(len(sentences) * highlight_percentage))
            num_to_highlight = min(len(sentences), wanted)
            sentences_to_highlight = random.sample(sentences, num_to_highlight)

            for _sentence, rect in sentences_to_highlight:
                if rect is None:
                    continue
                color = random.choice(colors)
                highlight = page.add_highlight_annot(rect)
                # Only the stroke colour is set: highlight annotations have
                # no fill colour.
                highlight.set_colors({"stroke": color})
                highlight.update()

            if random.random() < 0.5:
                add_red_marks(page)

        doc.save(output_path)
    finally:
        # Release the document even if processing or saving fails.
        doc.close()


def add_red_marks(page, font_size=18):
    """Stamp one randomly chosen red mark at a random position on *page*.

    A mark is picked from a fixed list of symbols and short Chinese words and
    written in red at a random point kept 50 units inside each page edge.
    A filled red rectangle of 50 x 20 units, with its top-left corner at the
    same point, is drawn beneath the existing page content
    (``overlay=False``).

    Args:
        page: A PyMuPDF page object; it is modified in place.
        font_size: Font size used for the mark text.
    """
    marks = ["*", "~", "!", "?", "公式", "重点"]
    red_color = (1.0, 0.0, 0.0)

    mark = random.choice(marks)

    page_rect = page.rect
    x = random.uniform(page_rect.x0 + 50, page_rect.x1 - 50)
    y = random.uniform(page_rect.y0 + 50, page_rect.y1 - 50)

    text_rect = fitz.Rect(x, y, x + 50, y + 20)

    # The default font ("helv") has no CJK glyphs, so the Chinese marks would
    # not render correctly with it. Use PyMuPDF's built-in Simplified Chinese
    # font for any mark containing non-ASCII characters.
    is_ascii = all(ord(ch) < 128 for ch in mark)
    fontname = "helv" if is_ascii else "china-s"

    page.insert_text(
        (x, y),
        mark,
        fontsize=font_size,
        fontname=fontname,
        color=red_color,
    )
    page.draw_rect(text_rect, color=red_color, fill=red_color, overlay=False)


if __name__ == "__main__":
    # Script entry point: highlight sentences on the configured page ranges
    # of the input PDF and write the result to the output PDF.
    input_pdf = r"your_path"
    output_pdf = r"your_path"

    # Inclusive (start, end) ranges of page indices to process;
    # adjust as needed.
    page_ranges = [
        (908, 1000),
    ]

    # Open the input once, only to learn how many pages it has.
    probe = fitz.open(input_pdf)
    total_pages = len(probe)
    probe.close()

    # Clamp every range to the document and collect the page indices;
    # the set removes duplicates from overlapping ranges.
    chosen = set()
    for first, last in page_ranges:
        lo = max(0, first)
        hi = min(total_pages - 1, last)
        if lo <= hi:
            chosen.update(range(lo, hi + 1))

    pages_to_highlight = sorted(chosen)

    random_highlight_sentences(input_pdf, output_pdf, 0.56, pages_to_highlight)
    print(f"已成功高亮句子并保存到 {output_pdf}")