<?php

// Direct-access guard: stop immediately when the ABSPATH constant is not
// defined (WordPress defines it during bootstrap), so the class below is
// only declared when this file is loaded from within WordPress.
if ( ! defined( 'ABSPATH' ) ) {
	exit;
}

/**
 * Fetches a remote HTTP(S) document and reduces it to a title plus plain text.
 *
 * Results are cached in a transient for ten minutes. URLs are validated before
 * any request is made in order to reduce the SSRF attack surface.
 */
class WP_AIW_Url_Fetcher {
	/**
	 * Downloads a URL and extracts its title and whitespace-normalised text.
	 *
	 * @param mixed $url             URL to fetch; anything that is not a valid, public http(s) URL is rejected.
	 * @param int   $timeout_seconds Request timeout, clamped to the range 3..60.
	 * @param int   $max_chars       Maximum length of the returned text, clamped to 1000..100000.
	 *
	 * @return array{ok:bool, data?:array{url:string,title:string,text:string}, error?:string}
	 */
	public function fetch( $url, $timeout_seconds = 15, $max_chars = 12000 ) {
		$url = $this->normalize_and_validate_url( $url );
		if ( $url === '' ) {
			return array( 'ok' => false, 'error' => 'URL 不合法或不允许抓取' );
		}

		$timeout_seconds = max( 3, min( 60, (int) $timeout_seconds ) );
		$max_chars = max( 1000, min( 100000, (int) $max_chars ) );

		// The cached text is already truncated, so the limit has to be part of
		// the key; otherwise a call with a different $max_chars would receive
		// text cut to somebody else's limit.
		$cache_key = 'wp_aiw_url_' . md5( $url . '|' . $max_chars );
		$cached = get_transient( $cache_key );
		if ( is_array( $cached ) && isset( $cached['url'], $cached['text'] ) ) {
			return array( 'ok' => true, 'data' => $cached );
		}

		$args = array(
			'timeout' => $timeout_seconds,
			'redirection' => 3,
			'user-agent' => 'WP-AIW-Writer/' . ( defined( 'WP_AIW_VERSION' ) ? WP_AIW_VERSION : 'dev' ),
			'reject_unsafe_urls' => true,
			'headers' => array(
				'Accept' => 'text/html, text/plain;q=0.9, */*;q=0.1',
			),
			// Cap the download at 1 MiB; the body may therefore be cut short.
			'limit_response_size' => 1024 * 1024,
		);

		$response = wp_safe_remote_get( $url, $args );
		if ( is_wp_error( $response ) ) {
			return array( 'ok' => false, 'error' => $response->get_error_message() );
		}

		$code = (int) wp_remote_retrieve_response_code( $response );
		if ( $code < 200 || $code >= 300 ) {
			return array( 'ok' => false, 'error' => '抓取失败（HTTP ' . $code . '）' );
		}

		$body = wp_remote_retrieve_body( $response );
		if ( ! is_string( $body ) || trim( $body ) === '' ) {
			return array( 'ok' => false, 'error' => '抓取内容为空' );
		}

		// A header that was sent more than once comes back as an array.
		$ct = wp_remote_retrieve_header( $response, 'content-type' );
		if ( is_array( $ct ) ) {
			$ct = implode( ', ', $ct );
		}
		$ct = (string) $ct;

		$body = $this->to_utf8( $body, $ct );

		$title = '';
		$text = '';

		if ( stripos( $ct, 'text/html' ) !== false || $this->looks_like_html( $body ) ) {
			$extracted = $this->extract_from_html( $body );
			$title = $extracted['title'];
			$text = $extracted['text'];
		} else {
			$text = $this->cleanup_text( $body );
		}

		$text = $this->truncate_text( $text, $max_chars );
		if ( $text === '' ) {
			return array( 'ok' => false, 'error' => '抓取内容无法提取有效文本' );
		}

		$data = array(
			'url' => $url,
			'title' => $title,
			'text' => $text,
		);

		set_transient( $cache_key, $data, 10 * MINUTE_IN_SECONDS );

		return array( 'ok' => true, 'data' => $data );
	}

	/**
	 * Validates a URL and returns its sanitised form.
	 *
	 * @param mixed $url Candidate URL.
	 *
	 * @return string The sanitised URL, or '' when the URL is malformed, is not
	 *                http(s), or points at a local/private/reserved host.
	 */
	private function normalize_and_validate_url( $url ) {
		$url = is_string( $url ) ? trim( $url ) : '';
		if ( $url === '' ) {
			return '';
		}

		if ( function_exists( 'wp_http_validate_url' ) ) {
			$validated = wp_http_validate_url( $url );
			if ( ! $validated ) {
				return '';
			}
			$url = (string) $validated;
		}

		$parts = wp_parse_url( $url );
		if ( ! is_array( $parts ) ) {
			return '';
		}

		$scheme = isset( $parts['scheme'] ) ? strtolower( (string) $parts['scheme'] ) : '';
		if ( ! in_array( $scheme, array( 'http', 'https' ), true ) ) {
			return '';
		}

		$host = isset( $parts['host'] ) ? strtolower( (string) $parts['host'] ) : '';
		if ( $host === '' ) {
			return '';
		}

		if ( ! $this->is_public_host( $host ) ) {
			return '';
		}

		return esc_url_raw( $url );
	}

	/**
	 * Tells whether a host name or IP literal refers to a public address.
	 *
	 * IP literals are checked directly: gethostbyname() returns a literal
	 * unchanged, so relying on "resolved value differs from host" would let
	 * addresses such as 169.254.169.254 skip the range check. Host names are
	 * resolved and every returned IPv4 address must be public. A name that
	 * does not resolve is let through, because the request itself will then
	 * fail. This is a best-effort pre-flight check; it does not protect
	 * against DNS answers changing between this check and the request.
	 *
	 * @param mixed $host Host component of a URL (IPv6 literals may be bracketed).
	 *
	 * @return bool False for localhost and private/reserved addresses, true otherwise.
	 */
	private function is_public_host( $host ) {
		$host = is_string( $host ) ? strtolower( trim( $host ) ) : '';
		// Strip IPv6 brackets and a trailing root dot before comparing.
		$host = rtrim( trim( $host, '[]' ), '.' );
		if ( $host === '' || $host === 'localhost' ) {
			return false;
		}

		$public_only = FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE;

		if ( filter_var( $host, FILTER_VALIDATE_IP ) !== false ) {
			return filter_var( $host, FILTER_VALIDATE_IP, $public_only ) !== false;
		}

		$addresses = gethostbynamel( $host );
		if ( is_array( $addresses ) ) {
			foreach ( $addresses as $address ) {
				if ( filter_var( $address, FILTER_VALIDATE_IP, $public_only ) === false ) {
					return false;
				}
			}
		}

		return true;
	}

	/**
	 * Cheap sniff for HTML markup, used when the Content-Type is not text/html.
	 *
	 * @param mixed $body Response body.
	 *
	 * @return bool True when the body contains "<html", "<body" or "<!doctype" (case-insensitive).
	 */
	private function looks_like_html( $body ) {
		$body = is_string( $body ) ? $body : '';
		return ( stripos( $body, '<html' ) !== false || stripos( $body, '<body' ) !== false || stripos( $body, '<!doctype' ) !== false );
	}

	/**
	 * Checks whether a string is well-formed UTF-8.
	 *
	 * @param string $value String to test.
	 *
	 * @return bool
	 */
	private function is_valid_utf8( $value ) {
		// With the "u" modifier PCRE refuses to match a malformed subject.
		return preg_match( '//u', $value ) === 1;
	}

	/**
	 * Converts a response body to UTF-8 when it is not already valid UTF-8.
	 *
	 * The source charset is taken from the Content-Type header, or failing
	 * that from a <meta ... charset=...> declaration near the top of the body.
	 * The body is returned untouched when it is already valid UTF-8, when
	 * mbstring is unavailable, or when no supported charset can be determined.
	 *
	 * @param mixed $body         Raw response body.
	 * @param mixed $content_type Value of the Content-Type response header.
	 *
	 * @return string
	 */
	private function to_utf8( $body, $content_type ) {
		$body = is_string( $body ) ? $body : '';
		if ( $body === '' || ! function_exists( 'mb_convert_encoding' ) || $this->is_valid_utf8( $body ) ) {
			return $body;
		}

		$charset = '';
		$found = array();
		if ( is_string( $content_type ) && preg_match( '/charset\s*=\s*["\']?\s*([A-Za-z0-9._:\-]+)/i', $content_type, $found ) ) {
			$charset = $found[1];
		} elseif ( preg_match( '/<meta[^>]+charset\s*=\s*["\']?\s*([A-Za-z0-9._:\-]+)/i', substr( $body, 0, 4096 ), $found ) ) {
			$charset = $found[1];
		}

		$encoding = $charset === '' ? '' : $this->resolve_mb_encoding( $charset );
		if ( $encoding === '' ) {
			return $body;
		}

		// For a declared UTF-8 body this replaces the malformed bytes (e.g. a
		// character cut in half by the response size cap).
		$converted = mb_convert_encoding( $body, 'UTF-8', $encoding );

		return ( is_string( $converted ) && $converted !== '' ) ? $converted : $body;
	}

	/**
	 * Maps a charset label to the name mbstring knows it by.
	 *
	 * Looking the label up first avoids passing an unknown encoding name to
	 * mb_convert_encoding().
	 *
	 * @param string $charset Charset label as declared by the remote server.
	 *
	 * @return string mbstring encoding name, or '' when the label is not supported.
	 */
	private function resolve_mb_encoding( $charset ) {
		if ( ! function_exists( 'mb_list_encodings' ) || ! function_exists( 'mb_encoding_aliases' ) ) {
			return '';
		}

		foreach ( mb_list_encodings() as $name ) {
			if ( strcasecmp( $name, $charset ) === 0 ) {
				return $name;
			}
			$aliases = mb_encoding_aliases( $name );
			if ( ! is_array( $aliases ) ) {
				continue;
			}
			foreach ( $aliases as $alias ) {
				if ( strcasecmp( $alias, $charset ) === 0 ) {
					return $name;
				}
			}
		}

		return '';
	}

	/**
	 * Extracts the document title and visible text from an HTML string.
	 *
	 * Uses DOMDocument when available (dropping script, style and noscript
	 * elements) and falls back to wp_strip_all_tags() when the DOM route
	 * yields no text.
	 *
	 * @param mixed $html HTML markup.
	 *
	 * @return array{title:string,text:string}
	 */
	private function extract_from_html( $html ) {
		$html = is_string( $html ) ? $html : '';
		$title = '';
		$text = '';

		// loadHTML() rejects an empty string, so skip the DOM route for it.
		if ( $html !== '' && class_exists( 'DOMDocument' ) ) {
			// Without an in-document charset declaration libxml does not read
			// the bytes as UTF-8, which garbles non-ASCII text. Encoding every
			// non-ASCII character as a numeric character reference makes the
			// markup pure ASCII, so the parser's charset guess no longer matters.
			$markup = $html;
			if ( function_exists( 'mb_encode_numericentity' ) && $this->is_valid_utf8( $html ) ) {
				$markup = mb_encode_numericentity( $html, array( 0x80, 0x10FFFF, 0, 0x1FFFFF ), 'UTF-8' );
			}

			$prev = libxml_use_internal_errors( true );
			$dom = new DOMDocument();
			$loaded = $dom->loadHTML( $markup, LIBXML_NOWARNING | LIBXML_NOERROR );
			libxml_clear_errors();
			libxml_use_internal_errors( $prev );

			if ( $loaded ) {
				$xpath = new DOMXPath( $dom );
				foreach ( array( '//script', '//style', '//noscript' ) as $q ) {
					$nodes = $xpath->query( $q );
					if ( $nodes ) {
						foreach ( $nodes as $node ) {
							if ( $node && $node->parentNode ) {
								$node->parentNode->removeChild( $node );
							}
						}
					}
				}

				$title_node = $xpath->query( '//title' );
				if ( $title_node && $title_node->length > 0 ) {
					$title = trim( (string) $title_node->item( 0 )->textContent );
				}

				$text = $this->cleanup_text( (string) $dom->textContent );
			}
		}

		if ( $text === '' ) {
			$text = $this->cleanup_text( wp_strip_all_tags( $html ) );
		}

		return array(
			'title' => $title,
			'text' => $text,
		);
	}

	/**
	 * Decodes HTML entities and collapses every whitespace run to one space.
	 *
	 * @param mixed $text Raw text.
	 *
	 * @return string Trimmed, single-spaced text.
	 */
	private function cleanup_text( $text ) {
		$text = is_string( $text ) ? $text : '';
		$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

		$collapsed = preg_replace( '/\s+/u', ' ', $text );
		if ( ! is_string( $collapsed ) ) {
			// preg_replace() returns null for malformed UTF-8 (for instance a
			// body cut in the middle of a character). Repair the string and
			// retry rather than discarding all of the text.
			if ( function_exists( 'mb_scrub' ) ) {
				$collapsed = preg_replace( '/\s+/u', ' ', mb_scrub( $text, 'UTF-8' ) );
			}
			if ( ! is_string( $collapsed ) ) {
				// Last resort: collapse ASCII whitespace byte-wise.
				$collapsed = preg_replace( '/[ \t\n\r\f\v]+/', ' ', $text );
			}
		}

		return trim( (string) $collapsed );
	}

	/**
	 * Shortens text to at most $max_chars characters.
	 *
	 * @param mixed $text      Text to shorten.
	 * @param int   $max_chars Character limit; values <= 0 disable truncation.
	 *
	 * @return string
	 */
	private function truncate_text( $text, $max_chars ) {
		$text = is_string( $text ) ? trim( $text ) : '';
		$max_chars = (int) $max_chars;
		if ( $text === '' ) {
			return '';
		}
		if ( $max_chars <= 0 ) {
			return $text;
		}
		if ( function_exists( 'mb_strlen' ) && function_exists( 'mb_substr' ) ) {
			if ( mb_strlen( $text, 'UTF-8' ) > $max_chars ) {
				return mb_substr( $text, 0, $max_chars, 'UTF-8' );
			}
			return $text;
		}
		if ( strlen( $text ) > $max_chars ) {
			$cut = substr( $text, 0, $max_chars );

			// Without mbstring the cut is byte-based and may land inside a
			// multi-byte character; drop up to three dangling bytes so the
			// result stays valid UTF-8.
			$repaired = $cut;
			for ( $i = 0; $i < 3 && $repaired !== '' && ! $this->is_valid_utf8( $repaired ); $i++ ) {
				$repaired = (string) substr( $repaired, 0, -1 );
			}

			return ( $repaired !== '' && $this->is_valid_utf8( $repaired ) ) ? $repaired : $cut;
		}
		return $text;
	}
}
