import { isProbablyReaderable, Readability } from "@mozilla/readability"
import TurndownService from "turndown"
import { LinkApi } from "../../api"
import { youtubeMetaService, youtubeService } from "../youtube"
import { Transcript, youtubeTranscriptService } from "../youtube/transcript"
import { generateMarkdown, groupSentences } from "./youtube"
import { parseUrl } from "../../utils"

/**
 * Page-level metadata gathered by `extractMetadata`.
 * Every field is `null` when none of the lookups for it produced a non-empty value.
 */
interface PageMetadata {
    /** Page title taken from Open Graph / Twitter / `<meta name="title">` tags, the `<title>` element or `document.title`. */
    title: string | null
    /** Page description taken from the description meta tags, falling back to the text of the first `<p>`. */
    description: string | null
    /** Image URL exactly as written in the page (meta `content` or `<img src>`); it is not resolved to an absolute URL here. */
    image: string | null
}

/** Result of converting a web page or a YouTube video into Markdown. */
export interface Content {
    /** Display name; `parseDOM` falls back to "Untitled" when the page has no title. */
    name: string
    /** Preview image URL, when one was found. */
    image?: string
    /** Short description, when one was found. */
    description?: string
    /** The extracted content rendered as Markdown. */
    markdown: string
    /** NOTE(review): never populated in this file — presumably filled in by callers; confirm. */
    links?: LinkApi[]
    /**
     * For pages: the result of Readability's `isProbablyReaderable` check.
     * For videos: `true` on the normal path, `false` when the sections step threw
     * and the plain transcript was used instead.
     */
    isReadable: boolean
    /** NOTE(review): never populated in this file — presumably attached by a PDF flow elsewhere; confirm. */
    pdfFile?: File
}

/**
 * Collects title, description and preview image for a page.
 *
 * Each field is resolved through an ordered list of lookups (Open Graph first,
 * then Twitter cards, then generic tags, then page content); the first
 * non-empty value wins and `null` is used when every lookup comes up empty.
 *
 * @param doc - Document to inspect; it is only read, never modified.
 * @returns The metadata found, with `null` for anything missing.
 */
const extractMetadata = (doc: Document): PageMetadata => {
    // Small lookup helpers so the fallback chains below stay readable.
    const attributeOf = (selector: string, attribute: string) =>
        doc.querySelector(selector)?.getAttribute(attribute)
    const metaContent = (selector: string) => attributeOf(selector, "content")
    const trimmedText = (selector: string) => doc.querySelector(selector)?.textContent?.trim()

    const title =
        metaContent('meta[property="og:title"]') ||
        metaContent('meta[name="twitter:title"]') ||
        metaContent('meta[name="title"]') ||
        trimmedText("title") ||
        doc.title ||
        null

    const description =
        metaContent('meta[property="og:description"]') ||
        metaContent('meta[name="twitter:description"]') ||
        metaContent('meta[name="description"]') ||
        // Last resort: the first paragraph of the page.
        trimmedText("p") ||
        null

    const image =
        metaContent('meta[property="og:image"]') ||
        metaContent('meta[name="twitter:image"]') ||
        metaContent('meta[itemprop="image"]') ||
        // Prefer an image inside the article over any image on the page.
        attributeOf("article img", "src") ||
        attributeOf("img", "src") ||
        null

    return { title, description, image }
}

/**
 * Removes the first image of the content when no readable text precedes it.
 *
 * "Precedes" means anywhere earlier in document order inside `contentDiv`:
 * the image's earlier siblings are checked, then the earlier siblings of each
 * ancestor up to (not including) `contentDiv`. Text nested inside a preceding
 * element (e.g. `<p>Intro</p><img>`) counts, and an image wrapped in
 * `<figure>` / `<a>` / `<p>` is judged by what comes before its wrapper.
 * Comment nodes and whitespace-only text are ignored.
 *
 * @param contentDiv - Container whose first `<img>` is inspected; mutated in place.
 */
const removeFirstImageIfNoTextBefore = (contentDiv: HTMLDivElement) => {
    const firstContentImage = contentDiv.querySelector("img")
    if (!firstContentImage) return

    // Only text and element nodes can carry readable text; comments must not count.
    const carriesText = (node: Node): boolean =>
        (node.nodeType === Node.TEXT_NODE || node.nodeType === Node.ELEMENT_NODE) &&
        Boolean(node.textContent?.trim())

    let current: Node | null = firstContentImage
    while (current && current !== contentDiv) {
        for (let sibling = current.previousSibling; sibling; sibling = sibling.previousSibling) {
            // Something readable comes first, so this is not a leading image.
            if (carriesText(sibling)) return
        }
        current = current.parentNode
    }

    firstContentImage.remove()
}

/**
 * Keeps only the first occurrence of each image inside the container.
 *
 * Two images are considered the same when their `src` values match after the
 * query string is cut off. Images without a `src` are left untouched.
 *
 * @param contentDiv - Container to deduplicate; mutated in place.
 */
const removeDuplicateImages = (contentDiv: HTMLDivElement) => {
    const alreadyKept = new Set<string>()

    for (const image of Array.from(contentDiv.querySelectorAll("img"))) {
        const source = image.getAttribute("src")
        if (!source) continue

        // Everything before the first "?" identifies the image.
        const queryStart = source.indexOf("?")
        const key = queryStart === -1 ? source : source.slice(0, queryStart)

        if (alreadyKept.has(key)) {
            image.remove()
            continue
        }
        alreadyKept.add(key)
    }
}

/**
 * Strips boilerplate from extracted page content, in place.
 *
 * Three passes run in order:
 * 1. remove elements that are never article content (scripts, navigation, ads, ...);
 * 2. remove elements that carry (almost) no text — except media and line/thematic
 *    breaks, which are content despite having no text of their own;
 * 3. unwrap `<div>`s whose only content is a single nested `<div>`.
 *
 * @param contentDiv - Container to clean; mutated in place.
 */
const cleanupContent = (contentDiv: HTMLDivElement) => {
    const selectorsToRemove = [
        "script",
        "style",
        "iframe",
        "nav",
        "header",
        "footer",
        ".advertisement",
        ".social-share",
        ".comments",
        ".related-articles",
        "aside",
        "noscript",
        "meta",
        "[aria-hidden='true']",
        "[role='banner']",
        "[role='navigation']",
        "[role='complementary']",
    ]

    selectorsToRemove.forEach((selector) => {
        contentDiv.querySelectorAll(selector).forEach((el) => el.remove())
    })

    // Elements that are meaningful without any text. Judging them by text length
    // would delete every image, which the image helpers in this file rely on.
    const mediaSelector = "img, picture, source, video, audio, track, svg, svg *, canvas"
    // Void break elements are kept themselves, but do not make an otherwise
    // empty wrapper (e.g. `<p><br></p>`) worth keeping.
    const keptAsIsSelector = `${mediaSelector}, br, hr`

    // Remove empty elements and those with minimal content
    contentDiv.querySelectorAll("*").forEach((el) => {
        if (el.matches(keptAsIsSelector) || el.querySelector(mediaSelector)) return

        const text = el.textContent?.trim() || ""
        if (text.length < 5) {
            el.remove()
        }
    })

    // Flatten nested divs with single children
    contentDiv.querySelectorAll("div").forEach((div) => {
        const onlyChild = div.children.length === 1 ? div.children[0] : null
        if (!onlyChild || onlyChild.tagName !== "DIV") return

        // `children` ignores text nodes: if the wrapper has loose text next to the
        // nested div, unwrapping would silently drop that text, so leave it alone.
        const wrapperText = (div.textContent ?? "").trim()
        const childText = (onlyChild.textContent ?? "").trim()
        if (wrapperText !== childText) return

        div.replaceWith(onlyChild)
    })
}

/**
 * Returns the base URL against which a page's relative links should resolve.
 *
 * This is the page's own normalized absolute URL rather than just its origin:
 * a path-relative reference such as `img/a.png` on
 * `https://site.com/blog/post/` must resolve to
 * `https://site.com/blog/post/img/a.png`, not `https://site.com/img/a.png`.
 *
 * @param url - Address the document was loaded from.
 * @returns The normalized absolute URL, or `""` when `url` cannot be parsed.
 */
const getBaseUrl = (url: string): string => {
    try {
        return new URL(url).href
    } catch {
        // Not an absolute URL: signal "no usable base" to the caller.
        return ""
    }
}

/**
 * Ensures the document has a `<base>` element so relative URLs can be resolved.
 *
 * Nothing happens when the document already contains a `<base>`, when
 * `getBaseUrl` yields no usable value for `originalUrl`, or when the document
 * has no `<head>` to host the element.
 *
 * @param doc - Document to modify in place.
 * @param originalUrl - Address the document was loaded from.
 */
const addBaseTag = (doc: Document, originalUrl: string) => {
    // Respect a base the page declared itself.
    if (doc.querySelector("base")) return

    const href = getBaseUrl(originalUrl)
    if (!href) return

    const baseElement = doc.createElement("base")
    baseElement.href = href

    // Put it first so it precedes any URL-bearing element in the head.
    doc.querySelector("head")?.prepend(baseElement)
}

/**
 * Converts a parsed web page into Markdown plus basic metadata.
 *
 * Readability is used when the page looks like an article; otherwise (or when
 * Readability yields nothing) the whole `<body>` is converted. The HTML is then
 * cleaned up, the leading and duplicate images are dropped, and Turndown renders
 * the result as Markdown.
 *
 * @param doc - Parsed page. It is not modified: Readability works on a clone.
 * @param url - Address the page was loaded from, used to resolve relative URLs.
 * @returns The converted content. The declared `null` is kept for callers, but
 *          this implementation always returns an object.
 */
const parseDOM = (doc: Document, url: string): Content | null => {
    // Readability mutates the document it parses, so hand it a copy.
    const documentClone = doc.cloneNode(true) as Document
    addBaseTag(documentClone, url)

    const isReadable = isProbablyReaderable(doc)

    // The HTML below comes from an arbitrary web page. Build the scratch container
    // in a separate, inert document: assigning untrusted markup to `innerHTML` of an
    // element owned by the live `document` starts image downloads and can run inline
    // handlers such as `<img onerror=...>`.
    const scratchDocument = document.implementation.createHTMLDocument("")
    const contentDiv = scratchDocument.createElement("div")

    if (isReadable) {
        const article = new Readability(documentClone).parse()
        if (article?.content) {
            contentDiv.innerHTML = article.content
        }
    }

    // Fallback: not an article, or Readability produced nothing.
    if (!contentDiv.innerHTML) {
        // `body` can be missing at runtime (e.g. non-HTML documents).
        contentDiv.innerHTML = documentClone.body?.innerHTML ?? ""
    }

    cleanupContent(contentDiv)
    removeFirstImageIfNoTextBefore(contentDiv)
    removeDuplicateImages(contentDiv)

    const turndownService = new TurndownService({
        headingStyle: "atx",
        bulletListMarker: "-",
        codeBlockStyle: "fenced",
    })

    const markdown = turndownService.turndown(contentDiv.innerHTML)

    // Metadata is read from the untouched original, not the clone Readability altered.
    const metadata = extractMetadata(doc)

    return {
        name: metadata.title || "Untitled",
        image: metadata.image || undefined,
        description: metadata.description || undefined,
        markdown,
        isReadable,
    }
}

/**
 * Renders a video as Markdown organised by sections when sections can be
 * obtained, and as a plain transcript otherwise.
 *
 * @param url - Video URL, passed through to the Markdown generators.
 * @param html - HTML of the video page, handed to the sections lookup.
 * @param transcript - Transcript entries of the video.
 * @param language - Language passed to the sections lookup.
 * @returns The generated Markdown, or whatever empty value the chosen generator yields.
 */
const getSectionsMarkdown = (
    url: string,
    html: string,
    transcript: Transcript[],
    language: string
) => {
    const sections = youtubeTranscriptService.fetchSections(html, language, transcript)

    return sections ? generateMarkdown(url, sections) : getTranscriptMarkdown(url, transcript)
}

/**
 * Renders a transcript as plain Markdown paragraphs.
 *
 * The formatted transcript is grouped by `groupSentences`; every group becomes
 * one output line in which each entry is preceded by a single space.
 *
 * @param url - Video URL, passed through to `groupSentences`.
 * @param transcript - Transcript entries of the video.
 * @returns The Markdown text, or `null` when grouping yields nothing.
 */
const getTranscriptMarkdown = (url: string, transcript: Transcript[]) => {
    const formattedTranscript = youtubeTranscriptService.formatTranscript(transcript)
    const paragraphs = groupSentences(url, formattedTranscript)
    if (!paragraphs) return null

    const pieces: string[] = []

    paragraphs.forEach((paragraph) => {
        paragraph.forEach((sentence) => {
            pieces.push(` ${sentence}`)
        })
        // One line per group.
        pieces.push(`\n`)
    })

    return pieces.join("")
}

/**
 * Converts a YouTube video (page HTML + transcript) into Markdown content.
 *
 * The sectioned rendering is tried first. If that step throws, the plain
 * transcript rendering is used instead and the result is flagged as not readable.
 *
 * @param url - Video URL.
 * @param html - HTML of the video page.
 * @param transcript - Transcript entries of the video.
 * @param language - Language passed to the sections lookup.
 * @returns The content, or `null` when the URL has no video id or no Markdown
 *          could be produced.
 */
const parseYoutubeVideo = async (
    url: string,
    html: string,
    transcript: Transcript[],
    language: string
): Promise<Content | null> => {
    const videoId = youtubeService.getVideoId(url)
    if (!videoId) return null

    const metaData = youtubeMetaService.get(html, videoId)

    // Both outcomes share the same shape and differ only in body and flag.
    const toContent = (
        markdown: string | null | undefined,
        isReadable: boolean
    ): Content | null => {
        if (!markdown) return null

        return {
            name: metaData.name,
            image: metaData.imageUrl,
            description: metaData.description,
            markdown,
            isReadable,
        }
    }

    try {
        return toContent(getSectionsMarkdown(url, html, transcript, language), true)
    } catch {
        // Sectioned rendering failed: degrade to the raw transcript.
        return toContent(getTranscriptMarkdown(url, transcript), false)
    }
}

/**
 * Public entry points of this module: `parseDOM` converts a parsed web page,
 * `parseYoutubeVideo` converts a YouTube page plus transcript. Both produce `Content`.
 */
export const contentService = {
    parseDOM,
    parseYoutubeVideo,
}
