1
0
Fork 0
mirror of https://git.sr.ht/~seirdy/seirdy.one synced 2024-11-23 21:02:09 +00:00
seirdy.one/static/robots.txt
2024-10-23 00:09:46 -04:00

152 lines
6.2 KiB
Text
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

User-agent: *
Disallow: /noindex/
Disallow: /misc/
# I opt out of online advertising so malware that injects ads on my site won't
# get paid. You should do the same. my ads.txt file contains a standard
# placeholder to forbid any compliant ad networks from paying for ad placement
# on my domain.
User-Agent: Adsbot
Disallow: /
Allow: /ads.txt
Allow: /app-ads.txt
# By allowing us access, you enable the maximum number
# of advertisers to confidently purchase advertising space on your pages. Our
# comprehensive data insights help advertisers understand the suitability and
# context of your content, ensuring that their ads align with your audience's
# interests and needs. This alignment leads to improved user experiences,
# increased engagement, and ultimately, higher revenue potential for your
# publication. (https://www.peer39.com/crawler-notice)
# --> fuck off.
User-agent: peer39_crawler
User-Agent: peer39_crawler/1.0
Disallow: /
## IP-violation scanners ##
# The next three are borrowed from https://www.videolan.org/robots.txt
# > This robot collects content from the Internet for the sole purpose of #
# helping educational institutions prevent plagiarism. [...] we compare student
# papers against the content we find on the Internet to see if we # can find
# similarities. (http://www.turnitin.com/robot/crawlerinfo.html)
# --> fuck off.
User-Agent: TurnitinBot
Disallow: /
# > NameProtect engages in crawling activity in search of a wide range of brand
# and other intellectual property violations that may be of interest to our
# clients. (http://www.nameprotect.com/botinfo.html)
# --> fuck off.
User-Agent: NPBot
Disallow: /
# iThenticate is a new service we have developed to combat the piracy of
# intellectual property and ensure the originality of written work for#
# publishers, non-profit agencies, corporations, and newspapers.
# (http://www.slysearch.com/)
# --> fuck off.
User-Agent: SlySearch
Disallow: /
# BLEXBot assists internet marketers to get information on the link structure
# of sites and their interlinking on the web, to avoid any technical and
# possible legal issues and improve overall online experience.
# (http://webmeup-crawler.com/)
# --> fuck off.
User-Agent: BLEXBot
Disallow: /
# Providing Intellectual Property professionals with superior brand protection
# services by artfully merging the latest technology with expert analysis.
# (https://www.checkmarknetwork.com/spider.html/)
# "The Internet is just way to big to effectively police alone." (ACTUAL quote)
# --> fuck off.
User-agent: CheckMarkNetwork/1.0 (+https://www.checkmarknetwork.com/spider.html)
Disallow: /
# Stop trademark violations and affiliate non-compliance in paid search.
# Automatically monitor your partner and affiliates online marketing to
# protect yourself from harmful brand violations and regulatory risks. We
# regularly crawl websites on behalf of our clients to ensure content
# compliance with brand and regulatory guidelines.
# (https://www.brandverity.com/why-is-brandverity-visiting-me)
# --> fuck off.
User-agent: BrandVerity/1.0
Disallow: /
## Misc. icky stuff ##
# Pipl assembles online identity information from multiple independent sources
# to create the most complete picture of a digital identity and connect it to
# real people and their offline identity records. When all the fragments of
# online identity data are collected, connected, and corroborated, the result
# is a more trustworthy identity.
# --> fuck off.
User-agent: PiplBot
Disallow: /
# Well-known overly-aggressive bot that claims to respect robots.txt: http://mj12bot.com/
User-agent: MJ12bot
Crawl-Delay: 10
## Gen-AI data scrapers ##
# Eat shit, OpenAI.
User-agent: ChatGPT-User
User-agent: GPTBot
Disallow: /
# Official way to opt-out of Google's generative AI training:
# <https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers>
User-agent: Google-Extended
Disallow: /
# Official way to opt-out of LLM training by Apple
# <https://support.apple.com/en-us/119829#datausage>
User-agent: Applebot-Extended
Disallow: /
# Anthropic-AI crawler posted guidance after a long period of crawling without opt-out documentation: <https://support.anthropic.com/en/articles/8896518-does-anthropic-crawl-data-from-the-web-and-how-can-site-owners-block-the-crawler>
User-agent: ClaudeBot
Disallow: /
# FacebookBot crawls public web pages to improve language models for our speech
# recognition technology.
# <https://developers.facebook.com/docs/sharing/bot/?_fb_noscript=1>
# UPDATE: The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.
# <https://developers.facebook.com/docs/sharing/webmasters/web-crawlers>
User-Agent: FacebookBot
User-Agent: meta-externalagent
Disallow: /
# This one doesn't support robots.txt: https://www.allenai.org/crawler
# block it with your reverse-proxy or WAF or something.
# See <https://ds.rois.ac.jp/center8/crawler/>
# Parent page says it builds LLMs in the infographic: <https://ds.rois.ac.jp/center8/>
User-agent: Cotoyogi
Disallow: /
# https://webz.io/bot.html
User-agent: Webzio-extended
Disallow: /
# I'm not blocking CCBot for now. It publishes a free index for anyone to use.
# Googe used this to train the initial version of Bard (now called Gemini).
# I allow CCBot since its index is also used for upstart/hobbyist search engines
# like Alexandria and for genuinely useful academic work I personally like.
# I allow Owler for similar reasons:
# <https://openwebsearch.eu/owler/#owler-opt-out>
# <https://openwebsearch.eu/common-goals-with-common-crawl/>.
# Omgilibot/Omgili is similar to CCBot, except it sells the scrape results.
# I'm not familiar enough with Omgili to make a call here.
# In the long run, my embedded robots meta-tags and headers could cover gen-AI
# I don't block cohere-ai or Perplexitybot: they don't appear to actually
# scrape data for LLM training purposes. The crawling powers search engines
# with integrated pre-trained LLMs.
# TODO: investigate whether YouBot scrapes to train its own in-house LLM.
Sitemap: https://seirdy.one/sitemap.xml