import re

import cross
import util.html_util as html_util
import util.util as util

# Bare URL: either a `scheme://` prefix (scheme = a letter followed by letters,
# digits, `+`, `.` or `-`) or a literal `mailto:`, followed by one or more
# non-whitespace characters. Matched case-insensitively.
URL = re.compile(r"(?:(?:[A-Za-z][A-Za-z0-9+.-]*://)|mailto:)[^\s]+", re.IGNORECASE)
# Markdown inline link `[label](url)`.
#   group 1: the label (one or more characters other than `]`)
#   group 2: the URL (same prefix rule as URL; no whitespace or `)` inside)
# Whitespace between the parentheses and the URL is tolerated.
MD_INLINE_LINK = re.compile(
    r"\[([^\]]+)\]\(\s*((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s\)]+)\s*\)",
    re.IGNORECASE,
)
# Markdown autolink `<url>`.
#   group 1: the URL (same prefix rule as URL; no whitespace or `>` inside)
MD_AUTOLINK = re.compile(
    r"<((?:(?:[A-Za-z][A-Za-z0-9+.\-]*://)|mailto:)[^\s>]+)>", re.IGNORECASE
)
# Hashtag: `#` not preceded by a word character, followed by word characters.
#   group 1: the tag text without the leading `#`
HASHTAG = re.compile(r"(?<!\w)\#([\w]+)")
# Handle of the form `@user` or `@user@domain`, not preceded by a word
# character or another `@`. The domain part must contain at least one dot.
#   group 1: the user part
#   group 2: the domain part (None when absent)
FEDIVERSE_HANDLE = re.compile(r"(?<![\w@])@([\w\.-]+)(?:@([\w\.-]+\.[\w\.-]+))?")


def tokenize_markdown(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Tokenize ``text`` in two passes: HTML first, then markdown.

    The text is fed through ``html_util.HTMLPostTokenizer`` (configured with
    ``handles`` as its mentions and ``tags`` as its tags). Every text token
    it yields is then scanned again by ``__tokenize_md``; link tokens are
    either kept or re-scanned as ``[label](href)``; all other tokens are
    passed through untouched.

    Args:
        text: The post body to tokenize.
        tags: Tag names forwarded to both tokenizer passes.
        handles: Handle pairs forwarded to both tokenizer passes.

    Returns:
        The resulting tokens in order; an empty list when ``text`` is empty.
    """
    if not text:
        return []

    parser = html_util.HTMLPostTokenizer()
    parser.mentions = handles
    parser.tags = tags
    parser.feed(text)

    result: list[cross.Token] = []

    for token in parser.get_tokens():
        if isinstance(token, cross.TextToken):
            result.extend(__tokenize_md(token.text, tags, handles))
            continue

        if not isinstance(token, cross.LinkToken):
            result.append(token)
            continue

        # A link with a label that util.canonical_label rejects is written
        # out as markdown and scanned again; any other link is kept as is.
        if token.label and not util.canonical_label(token.label, token.href):
            markdown_link = f"[{token.label}]({token.href})"
            result.extend(__tokenize_md(markdown_link, tags, handles))
        else:
            result.append(token)

    return result


def __tokenize_md(
    text: str, tags: list[str], handles: list[tuple[str, str]]
) -> list[cross.Token]:
    """Scan ``text`` left to right and split it into tokens.

    At each position the scanner tries, in order: a markdown inline link
    (``[label](url)``), a markdown autolink (``<url>``), a hashtag, a
    mention, and finally a bare URL. Characters that start none of these
    are accumulated and emitted as ``cross.TextToken`` runs.

    Args:
        text: The string to scan.
        tags: Known tags; a ``#word`` becomes a ``cross.TagToken`` only when
            its lowercased form is in this list.
        handles: Known handle pairs; an ``@handle`` becomes a
            ``cross.MentionToken`` only when the matched text is contained
            in one of the pairs. The second element of that pair is passed
            as the mention's first argument.

    Returns:
        The tokens in source order.
    """
    result: list[cross.Token] = []
    pending: list[str] = []
    pos: int = 0
    end: int = len(text)

    def flush_text() -> None:
        # Emit the plain characters gathered so far as a single text token.
        if pending:
            result.append(cross.TextToken("".join(pending)))
            pending.clear()

    while pos < end:
        char = text[pos]

        # [label](url)
        if char == "[" and (inline := MD_INLINE_LINK.match(text, pos)):
            flush_text()
            result.append(cross.LinkToken(inline.group(2), inline.group(1)))
            pos = inline.end()
            continue

        # <url>
        if char == "<" and (auto := MD_AUTOLINK.match(text, pos)):
            flush_text()
            target = auto.group(1)
            result.append(cross.LinkToken(target, target))
            pos = auto.end()
            continue

        # #tag -- only tags the caller listed become tag tokens
        if char == "#" and (hashtag := HASHTAG.match(text, pos)):
            name = hashtag.group(1)
            if name.lower() in tags:
                flush_text()
                result.append(cross.TagToken(name))
                pos = hashtag.end()
                continue

        # @user / @user@domain -- only handles the caller listed count
        if char == "@" and (mention := FEDIVERSE_HANDLE.match(text, pos)):
            needle = mention.group(0).strip()
            known = next((entry for entry in handles if needle in entry), None)
            if known:
                flush_text()
                # TODO: misskey doesn't provide a uri
                result.append(cross.MentionToken(known[1], ""))
                pos = mention.end()
                continue

        # Bare URL; also reached when one of the attempts above fell through.
        if bare := URL.match(text, pos):
            flush_text()
            address = bare.group(0)
            result.append(cross.LinkToken(address, address))
            pos = bare.end()
            continue

        # Nothing matched here: keep the character as plain text.
        pending.append(char)
        pos += 1

    flush_text()
    return result