From 1701c4b2541ab078a97884f315f7b3adba9772af Mon Sep 17 00:00:00 2001 From: Seirdy Date: Thu, 8 Aug 2024 02:21:00 -0400 Subject: [PATCH] Slow down MJ12bot --- static/robots.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/static/robots.txt b/static/robots.txt index 0998ff5..5b1d5aa 100644 --- a/static/robots.txt +++ b/static/robots.txt @@ -87,6 +87,10 @@ Disallow: / User-agent: PiplBot Disallow: / +# Well-known overly-aggressive bot that claims to respect robots.txt: http://mj12bot.com/ +User-agent: MJ12bot +Crawl-Delay: 10 + ## Gen-AI data scrapers ## # Eat shit, OpenAI. @@ -117,6 +121,9 @@ User-Agent: FacebookBot User-Agent: meta-externalagent Disallow: / +# The Allen Institute for AI crawler doesn't support robots.txt: https://www.allenai.org/crawler +# Block it with your reverse proxy, WAF, or something similar. + # I'm not blocking CCBot for now. It publishes a free index for anyone to use. # Googe used this to train the initial version of Bard (now called Gemini). # I allow CCBot since its index is also used for upstart/hobbyist search engines