From 287a0a5dc0194452c8abcbffd2b9e53b9619d5eb Mon Sep 17 00:00:00 2001 From: Rohan Kumar Date: Mon, 24 Jul 2023 15:33:02 -0700 Subject: [PATCH] More robots.txt exclusions For shitty services that at least respect robots.txt --- static/robots.txt | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/static/robots.txt b/static/robots.txt index 788825d..f9e3759 100644 --- a/static/robots.txt +++ b/static/robots.txt @@ -3,10 +3,14 @@ Disallow: /noindex/ Disallow: /misc/ # I opt out of online advertising so malware that injects ads on my site won't get paid. -# You should do the same. +# You should do the same. My ads.txt file contains a standard placeholder to forbid any +# compliant ad networks from paying for ad placement on my domain. User-Agent: Adsbot Disallow: / Allow: /ads.txt +Allow: /app-ads.txt + +# The next three are borrowed from https://www.videolan.org/robots.txt # > This robot collects content from the Internet for the sole purpose of # helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we # can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html) # --> fuck off. @@ -28,6 +32,12 @@ Disallow: / User-Agent: BLEXBot Disallow: / +# Providing Intellectual Property professionals with superior brand protection services by artfully merging the latest technology with expert analysis. (https://www.checkmarknetwork.com/spider.html/) +# "The Internet is just way to big to effectively police alone." (ACTUAL quote) +# --> fuck off. +User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html) +Disallow: / + # Eat shit, OpenAI. User-agent: ChatGPT-User Disallow: /