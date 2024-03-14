Crawler

A crawler (also known as a spider or bot) is an automated program that systematically browses and indexes web pages. Search engines use crawlers to discover, analyze, and index web content, making it findable in search results.

# Basic Crawler Implementation
class WebCrawler:
    """Queue-driven crawler skeleton.

    The class only contains the crawl loop. It expects the following members
    to be provided elsewhere (they are not defined in this class):
    ``fetch_page`` (awaited), ``process_page``, ``extract_links``,
    ``log_error`` and ``check_robots_txt``.
    """

    def __init__(self):
        # URLs whose page was fetched and processed without an exception.
        self.visited_urls = set()
        # Pending URLs; consumed from the front (first in, first out).
        self.url_queue = []
        # Not touched by this class; presumably used by check_robots_txt.
        self.robots_cache = {}

    async def crawl(self, start_url):
        """Crawl from *start_url* until the URL queue is empty.

        Returns immediately, without queueing anything, when *start_url* is
        not allowed. Each dequeued URL is skipped if it was already visited
        or if ``is_allowed`` rejects it; otherwise it is fetched, processed
        and scanned for links. A URL is recorded as visited only after all
        three steps succeed; any exception is passed to ``log_error`` and the
        loop moves on to the next URL.
        """
        if not self.is_allowed(start_url):
            return

        self.url_queue.append(start_url)

        while self.url_queue:
            url = self.url_queue.pop(0)

            if url in self.visited_urls:
                continue

            # The permission check must cover every URL, not just the seed:
            # anything added to the queue while crawling would otherwise be
            # fetched without ever consulting robots.txt.
            if not self.is_allowed(url):
                continue

            try:
                content = await self.fetch_page(url)
                self.process_page(content)
                self.extract_links(content)
                self.visited_urls.add(url)
            except Exception as e:
                # Per-URL boundary: one failing page must not stop the crawl.
                self.log_error(url, e)

    def is_allowed(self, url):
        """Return the result of ``check_robots_txt`` for *url*."""
        # Check robots.txt rules
        return self.check_robots_txt(url)

# Robots.txt Parser
class RobotsParser:
    """Extract Allow/Disallow/Crawl-delay/Sitemap directives from robots.txt."""

    def parse_robots_txt(self, content):
        """Parse robots.txt text into a dictionary of directives.

        Args:
            content: The full robots.txt body as a string.

        Returns:
            A dict with the keys ``'allow'``, ``'disallow'`` and ``'sitemap'``
            (lists of directive values in file order) and ``'crawl_delay'``
            (a float, or ``None`` when no valid Crawl-delay line is present;
            the last valid one wins).

        Directive names are matched case-insensitively, surrounding
        whitespace is ignored, and everything after a ``#`` is treated as a
        comment. ``User-agent`` lines are not interpreted: rules from every
        group in the file are merged into the same lists. A Crawl-delay
        whose value is not a number is ignored.
        """
        rules = {
            'allow': [],
            'disallow': [],
            'crawl_delay': None,
            'sitemap': []
        }

        # splitlines() copes with both "\n" and "\r\n" line endings.
        for raw_line in content.splitlines():
            # Drop trailing comments, then surrounding whitespace.
            line = raw_line.split('#', 1)[0].strip()

            name, separator, value = line.partition(':')
            if not separator:
                # Blank line, pure comment, or text without "name: value".
                continue

            name = name.strip().lower()
            value = value.strip()

            if name in ('allow', 'disallow', 'sitemap'):
                rules[name].append(value)
            elif name == 'crawl-delay':
                try:
                    rules['crawl_delay'] = float(value)
                except ValueError:
                    # A malformed delay must not abort parsing of the file.
                    continue

        return rules

<!-- Basic Meta Robots Tags -->
<!-- Example values for the generic "robots" name (addressed to all crawlers). -->
<meta name="robots" content="index, follow">
<meta name="robots" content="noindex, nofollow">
<meta name="robots" content="noarchive">

<!-- Specific Crawler Directives -->
<!-- Same syntax, but the name targets one crawler's user-agent token. -->
<meta name="googlebot" content="index, follow">
<meta name="bingbot" content="noindex">
<meta name="googlebot-news" content="noindex">

# Nginx X-Robots-Tag Configuration
# Each location below attaches an X-Robots-Tag response header to the
# requests it matches, so the directive is delivered in the HTTP response
# rather than in the page markup.

# /private/ -> "noindex, nofollow"
location /private/ {
    add_header X-Robots-Tag "noindex, nofollow";
}

# /temporary/ -> "noarchive"
location /temporary/ {
    add_header X-Robots-Tag "noarchive";
}

# /beta/ -> all three directives combined
location /beta/ {
    add_header X-Robots-Tag "noindex, nofollow, noarchive";
}

<?xml version="1.0" encoding="UTF-8"?>
<!-- XML Sitemap Example -->
<!-- The XML declaration must be the very first thing in the document, so
     this comment comes after it. -->
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>https://example.com/</loc>
    <lastmod>2024-03-15</lastmod>
    <changefreq>daily</changefreq>
    <priority>1.0</priority>
  </url>
  <url>
    <loc>https://example.com/products</loc>
    <lastmod>2024-03-14</lastmod>
    <changefreq>weekly</changefreq>
    <priority>0.8</priority>
  </url>
</urlset>

// Crawl Rate Controller
/**
 * Spaces outgoing requests so that at most `maxRequestsPerSecond` are started
 * per second. `makeRequest(url)` is not defined in this class and must be
 * provided elsewhere (e.g. by a subclass).
 */
class CrawlRateManager {
  /**
   * @param {number} maxRequestsPerSecond Positive, finite request rate.
   * @throws {RangeError} If the rate is not a positive finite number, since
   *   the interval between requests could not be computed from it.
   */
  constructor(maxRequestsPerSecond) {
    const rate = Number(maxRequestsPerSecond);
    if (!Number.isFinite(rate) || !(rate > 0)) {
      throw new RangeError('maxRequestsPerSecond must be a positive finite number');
    }

    this.maxRequestsPerSecond = maxRequestsPerSecond;
    this.requestQueue = []; // not used by this class itself
    // Start time reserved for the most recently scheduled request. It may lie
    // in the future while callers are still waiting for their slot.
    this.lastRequestTime = Date.now();
  }

  /**
   * Wait for the next free slot, then perform the request.
   *
   * The slot is reserved synchronously, before any `await`. Callers that
   * invoke this method concurrently therefore each get their own slot one
   * interval apart, instead of all computing the same delay and firing at
   * the same moment.
   *
   * @param {string} url Passed through to `makeRequest`.
   * @returns {Promise<*>} Whatever `makeRequest(url)` returns.
   */
  async scheduleRequest(url) {
    const minimumInterval = 1000 / this.maxRequestsPerSecond;
    const now = Date.now();

    // Earliest permitted start: one interval after the previously reserved
    // slot, or right now if that moment has already passed.
    const scheduledTime = Math.max(now, this.lastRequestTime + minimumInterval);
    this.lastRequestTime = scheduledTime;

    const delay = scheduledTime - now;
    if (delay > 0) {
      await new Promise(resolve => setTimeout(resolve, delay));
    }

    return this.makeRequest(url);
  }
}

<!-- Resource Hints for Crawlers -->
<!-- preload: fetch these assets early; "as" states the kind of resource. -->
<link rel="preload" href="/assets/critical.css" as="style">
<link rel="preload" href="/assets/main.js" as="script">
<!-- dns-prefetch / preconnect: prepare connections to other origins. -->
<link rel="dns-prefetch" href="//cdn.example.com">
<link rel="preconnect" href="https://api.example.com">

# Nginx Crawler Optimization
location / {
    # Gzip compression
    gzip on;
    # text/html is deliberately not listed: nginx always compresses it, and
    # naming it here only produces a "duplicate MIME type" warning.
    gzip_types text/plain text/css application/javascript;

    # Cache control
    expires 1h;
    add_header Cache-Control "public, no-transform";

    # Crawler specific settings
    # ~* is a case-insensitive regex match on the User-Agent request header.
    if ($http_user_agent ~* (googlebot|bingbot)) {
        set $crawl_rate "slow";
    }
}

# Crawler Log Analyzer
class CrawlerLogAnalyzer:
    """Aggregate per-crawler statistics from access-log lines.

    ``process_googlebot_entry``, ``process_bingbot_entry`` and
    ``parse_error`` are not defined in this class and must be provided
    elsewhere (e.g. by a subclass).
    """

    def analyze_logs(self, log_file):
        """Scan *log_file* and return the collected crawler statistics.

        Args:
            log_file: Any iterable yielding log lines as strings.

        Returns:
            A dict with ``'googlebot'`` and ``'bingbot'`` entries (each a dict
            with ``'requests'`` and ``'bytes'`` counters, updated by the
            corresponding ``process_*_entry`` method) and ``'errors'``, a
            list of ``parse_error(line)`` results for every line whose text
            contains a status beginning with 4 or 5.
        """
        crawler_stats = {
            'googlebot': {'requests': 0, 'bytes': 0},
            'bingbot': {'requests': 0, 'bytes': 0},
            'errors': []
        }

        for line in log_file:
            # Compare case-insensitively so both bots are detected the same
            # way regardless of how the user agent is capitalised in the log.
            lowered = line.lower()

            if 'googlebot' in lowered:
                self.process_googlebot_entry(line, crawler_stats)
            elif 'bingbot' in lowered:
                self.process_bingbot_entry(line, crawler_stats)

            # Error lines are recorded for every client, not only for bots.
            if '"status": "4' in line or '"status": "5' in line:
                crawler_stats['errors'].append(self.parse_error(line))

        return crawler_stats

// Crawler Performance Monitor
/**
 * Collects per-request crawl metrics and turns them into a summary report.
 * `updateCrawlDepth`, `calculateAverageResponse`, `calculateErrorRate` and
 * `analyzeCrawlPattern` are not defined in this class and must be provided
 * elsewhere.
 */
class CrawlerPerformance {
  constructor() {
    const emptyMetrics = {
      crawlRate: 0,          // incremented once per tracked request
      responseTime: [],      // one `response.timing` entry per request
      errors: [],            // { url, status, timestamp } for status >= 400
      crawlDepth: new Map()
    };
    this.metrics = emptyMetrics;
  }

  /**
   * Record one completed request.
   * @param {string} url Requested URL.
   * @param {{timing: *, status: number}} response Response being tracked.
   */
  trackRequest(url, response) {
    const stats = this.metrics;

    stats.crawlRate++;
    stats.responseTime.push(response.timing);
    this.updateCrawlDepth(url);

    // Only responses with status 400 and above are logged as errors.
    if (!(response.status >= 400)) {
      return;
    }

    const failure = {
      url,
      status: response.status,
      timestamp: new Date()
    };
    stats.errors.push(failure);
  }

  /**
   * Build the summary report from the three calculation helpers.
   * @returns {{averageResponseTime: *, errorRate: *, crawlDistribution: *}}
   */
  generateReport() {
    const averageResponseTime = this.calculateAverageResponse();
    const errorRate = this.calculateErrorRate();
    const crawlDistribution = this.analyzeCrawlPattern();

    return { averageResponseTime, errorRate, crawlDistribution };
  }
}

// Crawl Error Handler
/**
 * Dispatches a crawl error to the recovery routine registered for its type.
 * The routines call `checkRedirects`, `updateSitemap`, `notifyTeam`,
 * `checkServerHealth`, `checkRobotsRules` and `updateRobotsTxt`, none of
 * which are defined in this class; they must be provided elsewhere.
 */
class CrawlErrorHandler {
  /**
   * Run the recovery steps for `error.type`.
   *
   * @param {{type: string, url: string}} error Error descriptor.
   * @returns {Promise<undefined>} Resolves once the matching routine has
   *   finished; resolves immediately when no routine exists for the type.
   */
  async handleError(error) {
    const solutions = {
      '404': async (url) => {
        await this.checkRedirects(url);
        await this.updateSitemap(url);
        await this.notifyTeam(url, '404');
      },
      '500': async (url) => {
        await this.checkServerHealth();
        await this.notifyTeam(url, '500');
      },
      'robots_blocked': async (url) => {
        await this.checkRobotsRules();
        await this.updateRobotsTxt();
      }
    };

    // Only keys defined directly on the table count. A plain property lookup
    // would also find inherited members such as "toString" or "constructor"
    // and invoke them as if they were recovery routines.
    const isKnownType = Object.prototype.hasOwnProperty.call(solutions, error.type);
    if (!isKnownType) {
      return undefined;
    }

    return solutions[error.type](error.url);
  }
}

// Rate Limiting Implementation
/**
 * Fixed-window request limiter keyed by user agent and backed by Redis.
 *
 * The client only needs incr($key) and expire($key, $seconds) methods.
 */
class CrawlerRateLimit {
    private $redis;
    private $window = 60; // seconds
    private $limit = 10;  // requests per window

    /**
     * @param object|null $redis Redis client to use. Optional so that
     *                           existing `new CrawlerRateLimit()` calls keep
     *                           working; without a client isAllowed() fails.
     */
    public function __construct($redis = null) {
        $this->redis = $redis;
    }

    /**
     * Count one request for $userAgent and report whether it is within the
     * limit for the current window.
     *
     * @param string $userAgent Identifier the counter is kept for.
     * @return bool true while the count in this window is at most the limit.
     */
    public function isAllowed($userAgent) {
        $key = "rate_limit:" . $userAgent;

        // Increment first and decide on the returned value. Reading the
        // counter and incrementing it in separate steps lets parallel
        // requests all pass the check before any of them is counted.
        $current = (int) $this->redis->incr($key);

        // Start the window only when the counter is created. Renewing the
        // expiry on every request would keep pushing the window's end back.
        if ($current === 1) {
            $this->redis->expire($key, $this->window);
        }

        return $current <= $this->limit;
    }
}

Remember that proper crawler management is essential for effective SEO. Optimizing your site for crawlers while maintaining performance for users requires careful balance and regular monitoring.