<?php
/**
 * Global AI + Search Engine Refusal Layer
 * 
 * Handles meta tags, HTTP headers, robots.txt management,
 * and active enforcement for AI training and indexing refusal.
 */

namespace BlockAI;

class RefusalLayer {

    /**
     * User-agent substrings that trigger active enforcement (403 or challenge).
     *
     * Each entry is matched case-insensitively as a substring of the request's
     * User-Agent header in enforce_ai_blocking(). Note that the list also
     * contains a few SEO crawlers (MegaIndex, SemrushBot, AhrefsBot).
     *
     * @var string[]
     */
    private $ai_crawler_agents = [
        'GPTBot',
        'ChatGPT-User',
        'CCBot',
        'anthropic-ai',
        'ClaudeBot',
        'Claude-Web',
        'PerplexityBot',
        'Perplexity',
        'Google-Extended',
        'GoogleOther',
        'Applebot-Extended',
        'Omgilibot',
        'FacebookBot',
        'Bytespider', // ByteDance (TikTok)
        'Diffbot',
        'MegaIndex',
        'SemrushBot',
        'AhrefsBot',
    ];

    /**
     * Bot names that receive passive opt-out signals.
     *
     * Single source of truth for both the per-bot <meta> tags and the
     * robots.txt "Disallow" groups, so the two outputs cannot drift apart.
     *
     * @var string[]
     */
    private $ai_signal_bots = [
        'GPTBot',
        'ChatGPT-User',
        'ClaudeBot',
        'Claude-Web',
        'anthropic-ai',
        'PerplexityBot',
        'Perplexity',
        'CCBot',
        'Amazonbot',
        'Google-Extended',
        'OpenAI',
    ];

    /**
     * Search engine user-agent token => regex the reverse-DNS hostname must match.
     *
     * The first token found in the User-Agent (in this order) decides which
     * pattern applies. Engines without an entry are never treated as verified.
     *
     * @var array<string, string>
     */
    private $search_engine_hosts = [
        'Googlebot'   => '/\.google(bot)?\.com$/i',    // Google Search
        'Bingbot'     => '/\.search\.msn\.com$/i',     // Bing Search
        'Slurp'       => '/\.yahoo\.(net|com)$/i',     // Yahoo Search
        'DuckDuckBot' => '/\.duckduckgo\.com$/i',      // DuckDuckGo
        'Baiduspider' => '/\.baidu\.(com|jp)$/i',      // Baidu
        'YandexBot'   => '/\.yandex\.(ru|net|com)$/i', // Yandex
        'Sogou'       => '/\.sogou\.com$/i',           // Sogou
    ];

    /**
     * Lazily created ContentControl instance.
     *
     * @var ContentControl|null
     */
    private $content_control = null;

    /**
     * Per-request memo of DNS verification results, keyed by "ip|user agent".
     *
     * enforce_ai_blocking() is hooked on both `init` and `template_redirect`;
     * the memo keeps the expensive DNS lookups from running twice.
     *
     * @var array<string, bool>
     */
    private $verification_cache = [];

    /**
     * Register all WordPress hooks used by the refusal layer.
     *
     * Signal hooks (meta tags, headers, robots.txt) are always registered;
     * the callbacks themselves consult ContentControl / options to decide
     * whether to emit anything. Active enforcement hooks are registered only
     * when global refusal is on and the enforcement mode is `block` or
     * `challenge`.
     *
     * @return void
     */
    public function init() {
        // Meta tags in <head>.
        add_action('wp_head', [$this, 'add_refusal_meta_tags'], 1);

        // HTTP headers (early, before any output).
        add_action('send_headers', [$this, 'add_refusal_headers'], 1);

        // robots.txt: priority 99 so the section is appended after most other
        // plugins (e.g. Yoast SEO) have produced their output.
        add_filter('robots_txt', [$this, 'modify_robots_txt'], 99, 2);

        // Active enforcement is opt-in via the enforcement mode option.
        $global_refusal = get_option('block_ai_global_refusal', true);
        if ($global_refusal) {
            $enforcement_mode = get_option('block_ai_enforcement_mode', 'signals_only');

            if ($enforcement_mode === 'block' || $enforcement_mode === 'challenge') {
                // Check before the template is chosen...
                add_action('template_redirect', [$this, 'enforce_ai_blocking'], 1);
                // ...and as early as `init` to catch requests sooner.
                add_action('init', [$this, 'enforce_ai_blocking'], 1);
            }
        }
    }

    /**
     * Get the ContentControl instance, creating it on first use.
     *
     * @return ContentControl
     */
    private function get_content_control() {
        if ($this->content_control === null) {
            $this->content_control = new ContentControl();
        }
        return $this->content_control;
    }

    /**
     * Print refusal <meta> tags into the HTML head.
     *
     * AI tags and search engine tags are controlled independently through
     * ContentControl::should_apply_refusal() and
     * ContentControl::should_apply_search_engine_blocking().
     *
     * @return void
     */
    public function add_refusal_meta_tags() {
        $content_control = $this->get_content_control();

        // AI-specific meta tags (only when AI blocking applies).
        if ($content_control->should_apply_refusal()) {
            foreach ($this->ai_signal_bots as $bot) {
                echo '<meta name="' . esc_attr($bot) . '" content="noindex, nofollow, noarchive" />' . "\n";
            }

            // Generic AI training refusal tags.
            echo '<meta name="noai" content="true" />' . "\n";
            echo '<meta name="noimageai" content="true" />' . "\n";
        }

        // Search engine blocking (separate setting).
        if ($content_control->should_apply_search_engine_blocking()) {
            // Generic robots directive for all search engines.
            echo '<meta name="robots" content="noindex, nofollow, noarchive, nosnippet, noimageindex" />' . "\n";

            // Bot-specific directives.
            echo '<meta name="googlebot" content="noindex, nofollow, noarchive" />' . "\n";
            echo '<meta name="bingbot" content="noindex, nofollow" />' . "\n";
        }
    }

    /**
     * Send refusal HTTP headers.
     *
     * Does nothing when headers have already been sent.
     *
     * @return void
     */
    public function add_refusal_headers() {
        if (headers_sent()) {
            return;
        }

        $content_control = $this->get_content_control();

        // AI-specific headers (only when AI blocking applies).
        if ($content_control->should_apply_refusal()) {
            header('X-AI-Training: disallowed');
            header('X-Content-Usage: human-read-only');
        }

        // Search engine blocking header (separate setting).
        if ($content_control->should_apply_search_engine_blocking()) {
            header('X-Robots-Tag: noindex, nofollow, noarchive, nosnippet, noimageindex');
        }
    }

    /**
     * Append AI crawler "Disallow" groups to robots.txt.
     *
     * Hooked on the `robots_txt` filter with priority 99 so the section is
     * appended after the output of other plugins.
     *
     * @param string $output Current robots.txt content.
     * @param bool   $public Whether the site is set to be visible to search engines.
     * @return string robots.txt content, with the HumanGate section appended
     *                unless management is disabled, the site is not public,
     *                or the section is already present.
     */
    public function modify_robots_txt($output, $public) {
        // robots.txt management switched off: leave it to other plugins.
        if (get_option('block_ai_disable_robots_txt', false)) {
            return $output;
        }

        if (!$public) {
            return $output;
        }

        // Section marker already present: do not append a second copy.
        if (!empty($output) && strpos($output, '# HumanGate Plugin') !== false) {
            return $output;
        }

        // Only AI crawlers are listed here; search engines such as Googlebot
        // and Bingbot are deliberately not disallowed.
        $section = "# HumanGate Plugin - AI Crawler Blocks\n" . $this->build_robots_txt_rules();

        if (!empty($output)) {
            // Append below the existing content under a clearly marked header.
            return rtrim($output) . "\n\n" . $section;
        }

        // No existing content: emit a minimal allow-all group first.
        return "User-agent: *\nAllow: /\n\n" . $section;
    }

    /**
     * Build one "User-agent / Disallow: /" group per signal bot.
     *
     * @return string Groups separated by a blank line, without a trailing newline.
     */
    private function build_robots_txt_rules() {
        $groups = [];
        foreach ($this->ai_signal_bots as $bot) {
            $groups[] = 'User-agent: ' . $bot . "\n" . 'Disallow: /';
        }
        return implode("\n\n", $groups);
    }

    /**
     * Detect active plugins that are known to manage robots.txt.
     *
     * Detection relies on each plugin's version constant or a well-known
     * class/function being defined.
     *
     * @return string[] Human-readable plugin names; empty when none is detected.
     */
    public static function detect_robots_txt_plugins() {
        $detected = [];

        // Yoast SEO
        if (defined('WPSEO_VERSION') || class_exists('WPSEO_Options')) {
            $detected[] = 'Yoast SEO';
        }

        // Rank Math
        if (defined('RANK_MATH_VERSION') || class_exists('RankMath')) {
            $detected[] = 'Rank Math';
        }

        // All in One SEO
        if (defined('AIOSEO_VERSION') || class_exists('AIOSEO')) {
            $detected[] = 'All in One SEO';
        }

        // SEOPress
        if (defined('SEOPRESS_VERSION') || function_exists('seopress_get_service')) {
            $detected[] = 'SEOPress';
        }

        // The SEO Framework
        if (defined('THE_SEO_FRAMEWORK_VERSION') || class_exists('The_SEO_Framework\Load')) {
            $detected[] = 'The SEO Framework';
        }

        // Squirrly SEO
        if (defined('SQ_VERSION') || class_exists('SQ_Classes_FrontController')) {
            $detected[] = 'Squirrly SEO';
        }

        // SEO Ultimate
        if (defined('SU_VERSION') || class_exists('SEO_Ultimate')) {
            $detected[] = 'SEO Ultimate';
        }

        // WP Meta SEO
        if (defined('WPMS_VERSION') || class_exists('WP_Meta_SEO')) {
            $detected[] = 'WP Meta SEO';
        }

        return $detected;
    }

    /**
     * Actively block (or challenge) requests whose User-Agent names a known crawler.
     *
     * Admin, AJAX, cron and logged-in requests are never blocked. When the
     * request matches, the method records telemetry and then terminates the
     * request via send_block_response() or redirect_to_challenge().
     *
     * @return void
     */
    public function enforce_ai_blocking() {
        // Never interfere with admin, AJAX or cron requests.
        if (is_admin() || wp_doing_ajax() || wp_doing_cron()) {
            return;
        }

        // Logged-in users always pass.
        if (is_user_logged_in()) {
            return;
        }

        $user_agent = isset($_SERVER['HTTP_USER_AGENT'])
            ? sanitize_text_field(wp_unslash($_SERVER['HTTP_USER_AGENT']))
            : '';

        if (empty($user_agent)) {
            return;
        }

        // Cap the amount of text we scan, but keep checking: skipping
        // enforcement for long values would let a crawler opt out of blocking
        // simply by padding its User-Agent.
        if (strlen($user_agent) > 512) {
            $user_agent = substr($user_agent, 0, 512);
        }

        $matched_agent = $this->match_ai_crawler($user_agent);
        if ($matched_agent === null) {
            return;
        }

        // Optionally let through search engine indexing bots whose identity
        // has been confirmed via DNS.
        if ($this->should_verify_search_engine_bot($user_agent)
            && $this->verify_search_engine_bot($user_agent)
        ) {
            return;
        }

        $enforcement_mode = get_option('block_ai_enforcement_mode', 'signals_only');

        // Record the block attempt.
        $telemetry = new Telemetry();
        $telemetry->increment_blocked('ai_crawler_blocked');

        if ($enforcement_mode === 'block') {
            // Hard block: 403 Forbidden.
            $this->send_block_response($matched_agent);
        } elseif ($enforcement_mode === 'challenge') {
            // Soft block: serve the browser challenge page.
            $this->redirect_to_challenge('ai_crawler');
        }
    }

    /**
     * Find the first configured crawler token contained in a User-Agent.
     *
     * @param string $user_agent Sanitized User-Agent header value.
     * @return string|null The matching entry of $ai_crawler_agents, or null.
     */
    private function match_ai_crawler($user_agent) {
        foreach ($this->ai_crawler_agents as $agent) {
            if (stripos($user_agent, $agent) !== false) {
                return $agent;
            }
        }
        return null;
    }

    /**
     * Decide whether DNS verification should be attempted for this request.
     *
     * True only when the `block_ai_verify_search_engine_bots` option is
     * enabled and the User-Agent names one of the known search engine
     * indexing bots (these are search crawlers, not AI training crawlers).
     *
     * @param string $user_agent Sanitized User-Agent header value.
     * @return bool
     */
    private function should_verify_search_engine_bot($user_agent) {
        if (!get_option('block_ai_verify_search_engine_bots', false)) {
            return false;
        }

        return $this->get_search_engine_host_pattern($user_agent) !== null;
    }

    /**
     * Look up the hostname pattern for the search engine named in a User-Agent.
     *
     * @param string $user_agent Sanitized User-Agent header value.
     * @return string|null PCRE pattern the reverse-DNS hostname must match,
     *                     or null when no known search engine token is present.
     */
    private function get_search_engine_host_pattern($user_agent) {
        foreach ($this->search_engine_hosts as $token => $pattern) {
            if (stripos($user_agent, $token) !== false) {
                return $pattern;
            }
        }
        return null;
    }

    /**
     * Verify a claimed search engine bot with forward-confirmed reverse DNS.
     *
     * The client IP must reverse-resolve to a hostname in the engine's
     * domain, and that hostname must resolve back to the same IP. The second
     * step matters because PTR records are controlled by whoever owns the IP
     * block and can therefore claim any hostname.
     *
     * DNS lookups are slow, so the result is memoised for the rest of the
     * request.
     *
     * @param string $user_agent Sanitized User-Agent header value.
     * @return bool True only when the bot's identity was positively confirmed.
     */
    private function verify_search_engine_bot($user_agent) {
        $ip = $this->get_client_ip();

        if (!$ip) {
            return false;
        }

        // Only public addresses can belong to a search engine.
        if (!filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_NO_PRIV_RANGE | FILTER_FLAG_NO_RES_RANGE)) {
            return false;
        }

        // Fail closed: an engine we have no hostname pattern for cannot be verified.
        $pattern = $this->get_search_engine_host_pattern($user_agent);
        if ($pattern === null) {
            return false;
        }

        $cache_key = $ip . '|' . $user_agent;
        if (array_key_exists($cache_key, $this->verification_cache)) {
            return $this->verification_cache[$cache_key];
        }

        $verified = false;

        // gethostbyaddr() returns the unmodified IP when there is no PTR
        // record and false on malformed input; warnings are suppressed
        // because a failed lookup is an expected outcome here.
        $hostname = @gethostbyaddr($ip);

        if (is_string($hostname) && $hostname !== '' && $hostname !== $ip) {
            $hostname = rtrim($hostname, '.');
            if (preg_match($pattern, $hostname) === 1) {
                $verified = $this->hostname_resolves_to_ip($hostname, $ip);
            }
        }

        $this->verification_cache[$cache_key] = $verified;
        return $verified;
    }

    /**
     * Forward-confirm that a hostname resolves to the given IP address.
     *
     * @param string $hostname Hostname obtained from the reverse lookup.
     * @param string $ip       Validated IPv4 or IPv6 address.
     * @return bool True when one of the hostname's address records equals $ip.
     */
    private function hostname_resolves_to_ip($hostname, $ip) {
        if (filter_var($ip, FILTER_VALIDATE_IP, FILTER_FLAG_IPV4)) {
            $addresses = @gethostbynamel($hostname);
            return is_array($addresses) && in_array($ip, $addresses, true);
        }

        // IPv6: gethostbynamel() only returns IPv4, so query AAAA records.
        if (!function_exists('dns_get_record')) {
            return false;
        }

        $records = @dns_get_record($hostname, DNS_AAAA);
        if (!is_array($records)) {
            return false;
        }

        // Compare packed forms so differently abbreviated addresses still match.
        $expected = inet_pton($ip);
        foreach ($records as $record) {
            if (isset($record['ipv6']) && @inet_pton($record['ipv6']) === $expected) {
                return true;
            }
        }

        return false;
    }

    /**
     * Send a 403 Forbidden page to a blocked crawler and end the request.
     *
     * @param string $matched_agent Label shown on the page as the detected crawler.
     * @return void This method does not return; it calls exit.
     */
    private function send_block_response($matched_agent) {
        if (!headers_sent()) {
            // Keep page caches and CDNs from storing the block page and
            // serving it to regular visitors of the same URL.
            nocache_headers();
            http_response_code(403);
            header('Content-Type: text/html; charset=UTF-8');
            header('X-Robots-Tag: noindex, nofollow');
            header('X-AI-Training: disallowed');
            header('Retry-After: 3600');
        }

        // Minimal self-contained HTML response.
        echo '<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="robots" content="noindex, nofollow">
    <title>Access Denied</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            display: flex;
            justify-content: center;
            align-items: center;
            min-height: 100vh;
            margin: 0;
            background: #f5f5f5;
            color: #333;
        }
        .container {
            text-align: center;
            padding: 2rem;
        }
        h1 { color: #d63638; }
    </style>
</head>
<body>
    <div class="container">
        <h1>403 - Access Denied</h1>
        <p>AI training crawlers are not permitted on this site.</p>
        <p><small>Detected: ' . esc_html($matched_agent) . '</small></p>
    </div>
</body>
</html>';

        exit;
    }

    /**
     * Determine the client IP address from the request.
     *
     * Candidate sources are tried in order (Cloudflare, X-Real-IP,
     * X-Forwarded-For, REMOTE_ADDR) and the first syntactically valid IP wins.
     *
     * NOTE(review): the three HTTP_* headers are supplied by the client unless
     * a trusted reverse proxy overwrites them, so on a directly exposed server
     * this value can be spoofed. Confirm the deployment model before relying
     * on it for security decisions such as bot verification.
     *
     * @return string|false Valid IP address, or false when none was found.
     */
    private function get_client_ip() {
        $ip_keys = [
            'HTTP_CF_CONNECTING_IP',     // Cloudflare
            'HTTP_X_REAL_IP',            // Nginx proxy
            'HTTP_X_FORWARDED_FOR',      // Standard proxy header
            'REMOTE_ADDR',               // Direct connection
        ];

        foreach ($ip_keys as $key) {
            if (!isset($_SERVER[$key])) {
                continue;
            }

            $ip = sanitize_text_field(wp_unslash($_SERVER[$key]));

            // X-Forwarded-For may hold a comma-separated chain; the first
            // entry is the original client.
            if ($key === 'HTTP_X_FORWARDED_FOR') {
                $ips = explode(',', $ip);
                $ip = trim($ips[0]);
            }

            if (filter_var($ip, FILTER_VALIDATE_IP)) {
                return $ip;
            }
        }

        return false;
    }

    /**
     * Serve the JavaScript browser challenge page and end the request.
     *
     * Falls back to a hard 403 when the friction system is disabled or the
     * client IP cannot be determined. Despite the name, no HTTP redirect is
     * issued: the challenge page is rendered in place with status 200.
     *
     * @param string $reason Reason for the challenge (currently unused).
     * @return void This method does not return; every path ends in exit.
     */
    private function redirect_to_challenge($reason) {
        // The challenge relies on the friction system; without it, hard block.
        if (!get_option('block_ai_friction_enabled', true)) {
            $this->send_block_response('AI Crawler');
            return;
        }

        $ip = $this->get_client_ip();
        if (!$ip) {
            // Cannot bind the challenge to an IP: deny access.
            $this->send_block_response('AI Crawler');
            return;
        }

        // NOTE(review): the token doubles as the transient key. Nonces are
        // derived from user/session/action/time rather than being random, so
        // confirm the value is unique per challenge for anonymous visitors;
        // otherwise concurrent challenges would overwrite each other's IP.
        // Left unchanged because the verification handler is not visible here.
        $challenge_token = wp_create_nonce('block_ai_challenge');
        $challenge_nonce = wp_create_nonce('humangate_challenge');
        set_transient('block_ai_challenge_' . $challenge_token, $ip, 300);

        if (!headers_sent()) {
            // The challenge page must never be cached in place of the real page.
            nocache_headers();
            http_response_code(200);
            header('Content-Type: text/html; charset=UTF-8');
        }

        // Build the raw URL. It is embedded in a <script> block, where HTML
        // entities are NOT decoded, so esc_js()/esc_attr() must not be used:
        // they would turn "&" into "&amp;" and break the "token" parameter.
        $verify_url = admin_url(
            'admin-ajax.php?action=block_ai_verify_challenge&token=' . rawurlencode($challenge_token)
        );

        // JSON flags that keep the encoded literal inert inside <script>
        // (no raw <, >, &, or quotes).
        $js_flags = JSON_HEX_TAG | JSON_HEX_AMP | JSON_HEX_APOS | JSON_HEX_QUOT;

        // Fetch raw translations and escape per output context: esc_html() for
        // markup, wp_json_encode() for JavaScript (assigned via textContent,
        // which would display HTML entities literally).
        $title_text = __('Verifying...', 'humangate');
        $verifying_text = __('Verifying browser...', 'humangate');
        $failed_text = __('Verification failed. Please try again.', 'humangate');
        $error_text = __('Verification error. Please refresh the page.', 'humangate');

        // Same challenge page markup as the friction system.
        echo '<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="robots" content="noindex, nofollow">
    <title>' . esc_html($title_text) . '</title>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
            display: flex;
            justify-content: center;
            align-items: center;
            min-height: 100vh;
            margin: 0;
            background: #f5f5f5;
            color: #333;
        }
        .container {
            text-align: center;
            padding: 2rem;
        }
        .spinner {
            border: 3px solid #f3f3f3;
            border-top: 3px solid #333;
            border-radius: 50%;
            width: 40px;
            height: 40px;
            animation: spin 1s linear infinite;
            margin: 0 auto 1rem;
        }
        @keyframes spin {
            0% { transform: rotate(0deg); }
            100% { transform: rotate(360deg); }
        }
    </style>
</head>
<body>
    <div class="container">
        <div class="spinner"></div>
        <p>' . esc_html($verifying_text) . '</p>
    </div>
    <script>
        (function() {
            function generateToken() {
                const data = {
                    timestamp: Date.now(),
                    random: Math.random().toString(36).substring(2),
                    performance: performance.now(),
                    screen: window.screen.width + "x" + window.screen.height,
                    timezone: Intl.DateTimeFormat().resolvedOptions().timeZone,
                    languages: navigator.languages.join(",")
                };
                const str = JSON.stringify(data);
                let hash = 0;
                for (let i = 0; i < str.length; i++) {
                    const char = str.charCodeAt(i);
                    hash = ((hash << 5) - hash) + char;
                    hash = hash & hash;
                }
                return btoa(str).substring(0, 32) + Math.abs(hash).toString(36);
            }
            
            const token = generateToken();
            const verifyUrl = ' . wp_json_encode($verify_url, $js_flags) . ';
            const challengeNonce = ' . wp_json_encode($challenge_nonce, $js_flags) . ';
            
            fetch(verifyUrl, {
                method: "POST",
                headers: {
                    "Content-Type": "application/x-www-form-urlencoded",
                },
                body: "browser_token=" + encodeURIComponent(token) + "&humangate_challenge_nonce=" + encodeURIComponent(challengeNonce)
            })
            .then(response => response.json())
            .then(data => {
                if (data.success) {
                    window.location.reload();
                } else {
                    document.querySelector(".container p").textContent = ' . wp_json_encode($failed_text, $js_flags) . ';
                }
            })
            .catch(error => {
                document.querySelector(".container p").textContent = ' . wp_json_encode($error_text, $js_flags) . ';
            });
        })();
    </script>
</body>
</html>';

        exit;
    }
}

