# ===========================================================================
# robots.txt — Bible Empire Kampala
# Domain : https://bibleempirekampala.ug/
# Updated : 2026-03
# Purpose : Maximise crawl budget for SEO-critical content, protect
#           private/utility paths, and control AI training scrapers.
#
# DEPLOYMENT: Place this file at the root of your web server so it is
# accessible at https://bibleempirekampala.ug/robots.txt
# ===========================================================================

# ---------------------------------------------------------------------------
# SECTION 1 — PRIMARY SEARCH ENGINES
# Full access to all public content. Crawl-delay intentionally omitted for
# Google — Googlebot ignores the directive and self-regulates its crawl
# rate, which produces faster indexing. (Bingbot receives a modest
# Crawl-delay in its group below.)
# ---------------------------------------------------------------------------

User-agent: Googlebot
Allow: /
Allow: /images/
Allow: /css/
Allow: /site.webmanifest
Disallow: /cdn-cgi/
Disallow: /icons/

User-agent: Googlebot-Image
# Explicitly allow all product & gallery images for Google Image Search.
# This is important — product images rank in image search and drive local traffic.
Allow: /images/

User-agent: Googlebot-Video
Allow: /

User-agent: Googlebot-News
Allow: /

User-agent: AdsBot-Google
Allow: /

User-agent: AdsBot-Google-Mobile
Allow: /

User-agent: Bingbot
Allow: /
Allow: /images/
Allow: /css/
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 5

User-agent: Slurp
# Yahoo Search (powered by Bing)
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 10

User-agent: DuckDuckBot
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 5

User-agent: Baiduspider
# Baidu — Chinese search engine, relevant for diaspora traffic
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 10

User-agent: YandexBot
# Yandex — Russian search engine
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 10

# ---------------------------------------------------------------------------
# SECTION 2 — SOCIAL MEDIA PREVIEW CRAWLERS
# These bots read Open Graph and Twitter Card meta tags to generate
# link previews when someone shares bibleempirekampala.ug on social media.
# Blocking them breaks WhatsApp, Facebook, and Instagram link previews.
# ---------------------------------------------------------------------------

User-agent: facebookexternalhit
# Facebook & Instagram link preview bot — reads og:image, og:title
Allow: /
Allow: /images/

User-agent: Twitterbot
# Twitter/X card preview bot
Allow: /
Allow: /images/

User-agent: LinkedInBot
Allow: /
Allow: /images/

User-agent: WhatsApp
# WhatsApp link preview — critical since the site uses wa.me CTAs
Allow: /
Allow: /images/

User-agent: TelegramBot
Allow: /
Allow: /images/

User-agent: Slackbot
Allow: /
Allow: /images/

# ---------------------------------------------------------------------------
# SECTION 3 — SEO & ANALYTICS TOOLS
# Allowing these lets Bible Empire track its own rankings, backlinks,
# and competitor gaps in tools like Semrush and Ahrefs.
# Remove or Disallow these if the client prefers not to be tracked
# by third-party SEO tools.
# ---------------------------------------------------------------------------

User-agent: SemrushBot
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 10

User-agent: AhrefsBot
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 10

User-agent: MJ12bot
# Majestic SEO
# Blocked — Majestic crawls very aggressively; no client benefit.
Disallow: /

User-agent: DotBot
# Moz SEO crawler
Allow: /
Disallow: /cdn-cgi/
Disallow: /icons/
Crawl-delay: 10

# ---------------------------------------------------------------------------
# SECTION 4 — AI TRAINING & LLM SCRAPERS
# These bots harvest web content to train large language models.
# Blocking them does NOT affect Google search rankings.
# Recommended: Block to protect the client's product descriptions,
# pricing, and copywriting from being used in AI training datasets.
# ---------------------------------------------------------------------------

User-agent: GPTBot
# OpenAI training scraper
Disallow: /

User-agent: ChatGPT-User
Disallow: /

User-agent: Google-Extended
# Google Gemini / Bard training scraper (separate from Googlebot)
Disallow: /

User-agent: anthropic-ai
# Anthropic Claude training scraper
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Claude-Web
Disallow: /

User-agent: cohere-ai
Disallow: /

User-agent: PerplexityBot
Disallow: /

User-agent: YouBot
Disallow: /

User-agent: CCBot
# Common Crawl — large open dataset used by many AI companies
Disallow: /

User-agent: omgili
Disallow: /

User-agent: omgilibot
Disallow: /

# ---------------------------------------------------------------------------
# SECTION 5 — MALICIOUS, SPAM & AGGRESSIVE BOTS
# These crawlers scrape contact details, product prices, and content
# for spam, price monitoring, or content theft. Block them all.
# ---------------------------------------------------------------------------

User-agent: SeznamBot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: ia_archiver
# Alexa/Amazon archiver — no SEO benefit, high crawl cost
Disallow: /

User-agent: Exabot
Disallow: /

User-agent: sogou
Disallow: /

User-agent: Sogou web spider
Disallow: /

User-agent: proximic
Disallow: /

User-agent: GrapeshotCrawler
Disallow: /

User-agent: magpie-crawler
Disallow: /

User-agent: Screaming Frog SEO Spider
# Prevent competitors from scraping prices and content via SEO auditing tools
Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: BLEXBot
Disallow: /

User-agent: SurveyBot
Disallow: /

User-agent: VoilaBot
Disallow: /

User-agent: EmailCollector
Disallow: /

User-agent: EmailSiphon
Disallow: /

User-agent: ExtractorPro
Disallow: /

User-agent: WebCopier
Disallow: /

User-agent: HTTrack
Disallow: /

User-agent: Offline Explorer
Disallow: /

# ---------------------------------------------------------------------------
# SECTION 6 — PATHS: ALWAYS BLOCK (applies to ALL bots not listed above)
# These paths exist on the server but have zero SEO value and should
# never be indexed. Blocking them preserves crawl budget for the pages
# and content that actually rank and convert.
# ---------------------------------------------------------------------------

User-agent: *
# Block Cloudflare internal utilities (email obfuscation, challenge pages, etc.)
Disallow: /cdn-cgi/
# Block self-hosted FontAwesome icon library — pure CSS/font assets, no SEO value
Disallow: /icons/
# Block any accidental server/editor temp files
Disallow: /*.bak$
Disallow: /*.log$
Disallow: /*.sql$
Disallow: /*.env$
Disallow: /*.config$
# Block common CMS admin paths (defensive — in case the site ever migrates to WordPress, etc.)
Disallow: /wp-admin/
Disallow: /wp-login.php
Disallow: /wp-json/
Disallow: /wp-content/uploads/
Disallow: /admin/
Disallow: /administrator/
Disallow: /login/
Disallow: /dashboard/
Disallow: /cgi-bin/
# Allow all remaining public content
Allow: /

# ---------------------------------------------------------------------------
# SECTION 7 — SITEMAP DECLARATION
# Declare the sitemap location so every search engine crawler can find
# and index all pages without waiting for a manual GSC submission.
# Create this sitemap at: https://bibleempirekampala.ug/sitemap.xml
# ---------------------------------------------------------------------------

Sitemap: https://bibleempirekampala.ug/sitemap.xml