1
0
Fork 0
mirror of https://git.sr.ht/~seirdy/seirdy.one synced 2024-11-27 14:12:09 +00:00

More robots.txt exclusions

For shitty services that at least respect robots.txt
This commit is contained in:
Rohan Kumar 2023-07-24 15:33:02 -07:00
parent 48f1e3873e
commit 287a0a5dc0
No known key found for this signature in database
GPG key ID: 1E892DB2A5F84479

View file

@ -3,10 +3,14 @@ Disallow: /noindex/
Disallow: /misc/ Disallow: /misc/
# I opt out of online advertising so malware that injects ads on my site won't get paid. # I opt out of online advertising so malware that injects ads on my site won't get paid.
# You should do the same. # You should do the same. My ads.txt file contains a standard placeholder to forbid any
# compliant ad networks from paying for ad placement on my domain.
User-Agent: Adsbot User-Agent: Adsbot
Disallow: / Disallow: /
Allow: /ads.txt Allow: /ads.txt
Allow: /app-ads.txt
# The next three are borrowed from https://www.videolan.org/robots.txt
# > This robot collects content from the Internet for the sole purpose of # helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we # can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html) # > This robot collects content from the Internet for the sole purpose of # helping educational institutions prevent plagiarism. [...] we compare student papers against the content we find on the Internet to see if we # can find similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
# --> fuck off. # --> fuck off.
@ -28,6 +32,12 @@ Disallow: /
User-Agent: BLEXBot User-Agent: BLEXBot
Disallow: / Disallow: /
# Providing Intellectual Property professionals with superior brand protection services by artfully merging the latest technology with expert analysis. (https://www.checkmarknetwork.com/spider.html/)
# "The Internet is just way to big to effectively police alone." (ACTUAL quote)
# --> fuck off.
User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)
Disallow: /
# Eat shit, OpenAI. # Eat shit, OpenAI.
User-agent: ChatGPT-User User-agent: ChatGPT-User
Disallow: / Disallow: /