diff --git a/.github/actions/spelling/expect.txt b/.github/actions/spelling/expect.txt
index 3f089dd..b355b87 100644
--- a/.github/actions/spelling/expect.txt
+++ b/.github/actions/spelling/expect.txt
@@ -85,6 +85,7 @@ goodbot
 googlebot
 govulncheck
 GPG
+GPT
 grw
 Hashcash
 hashrate
@@ -136,6 +137,7 @@ memes
 metrix
 mimi
 minica
+mistralai
 Mojeek
 mojeekbot
 mozilla
@@ -146,6 +148,7 @@ NONINFRINGEMENT
 nosleep
 ogtags
 onionservice
+openai
 openrc
 pag
 parseable
diff --git a/data/clients/mistral-mistralai-user.yaml b/data/clients/mistral-mistralai-user.yaml
new file mode 100644
index 0000000..59b86fb
--- /dev/null
+++ b/data/clients/mistral-mistralai-user.yaml
@@ -0,0 +1,10 @@
+# Acts on behalf of user requests
+# https://docs.mistral.ai/robots/
+- name: mistral-mistralai-user
+  user_agent_regex: MistralAI-User/.+; \+https\://docs\.mistral\.ai/robots
+  action: ALLOW
+  # https://mistral.ai/mistralai-user-ips.json
+  remote_addresses: [
+    "20.240.160.161/32",
+    "20.240.160.1/32",
+  ]
diff --git a/data/clients/openai-chatgpt-user.yaml b/data/clients/openai-chatgpt-user.yaml
new file mode 100644
index 0000000..063cf08
--- /dev/null
+++ b/data/clients/openai-chatgpt-user.yaml
@@ -0,0 +1,93 @@
+# Acts on behalf of user requests
+# https://platform.openai.com/docs/bots/overview-of-openai-crawlers
+- name: openai-chatgpt-user
+  user_agent_regex: ChatGPT-User/.+; \+https\://openai\.com/bot
+  action: ALLOW
+  # https://openai.com/chatgpt-user.json
+  # curl 'https://openai.com/chatgpt-user.json' | jq '.prefixes.[].ipv4Prefix' | sed 's/$/,/'
+  remote_addresses: [
+    "13.65.138.112/28",
+    "23.98.179.16/28",
+    "13.65.138.96/28",
+    "172.183.222.128/28",
+    "20.102.212.144/28",
+    "40.116.73.208/28",
+    "172.183.143.224/28",
+    "52.190.190.16/28",
+    "13.83.237.176/28",
+    "51.8.155.64/28",
+    "74.249.86.176/28",
+    "51.8.155.48/28",
+    "20.55.229.144/28",
+    "135.237.131.208/28",
+    "135.237.133.48/28",
+    "51.8.155.112/28",
+    "135.237.133.112/28",
+    "52.159.249.96/28",
+    "52.190.137.16/28",
+    "52.255.111.112/28",
+    "40.84.181.32/28",
+    "172.178.141.112/28",
+    "52.190.142.64/28",
+    "172.178.140.144/28",
+    "52.190.137.144/28",
+    "172.178.141.128/28",
+    "57.154.187.32/28",
+    "4.196.118.112/28",
+    "20.193.50.32/28",
+    "20.215.188.192/28",
+    "20.215.214.16/28",
+    "4.197.22.112/28",
+    "4.197.115.112/28",
+    "172.213.21.16/28",
+    "172.213.11.144/28",
+    "172.213.12.112/28",
+    "172.213.21.144/28",
+    "20.90.7.144/28",
+    "57.154.175.0/28",
+    "57.154.174.112/28",
+    "52.236.94.144/28",
+    "137.135.191.176/28",
+    "23.98.186.192/28",
+    "23.98.186.96/28",
+    "23.98.186.176/28",
+    "23.98.186.64/28",
+    "68.221.67.192/28",
+    "68.221.67.160/28",
+    "13.83.167.128/28",
+    "20.228.106.176/28",
+    "52.159.227.32/28",
+    "68.220.57.64/28",
+    "172.213.21.112/28",
+    "68.221.67.224/28",
+    "68.221.75.16/28",
+    "20.97.189.96/28",
+    "52.252.113.240/28",
+    "52.230.163.32/28",
+    "172.212.159.64/28",
+    "52.255.111.80/28",
+    "52.255.111.0/28",
+    "4.151.241.240/28",
+    "52.255.111.32/28",
+    "52.255.111.48/28",
+    "52.255.111.16/28",
+    "52.230.164.176/28",
+    "52.176.139.176/28",
+    "52.173.234.16/28",
+    "4.151.71.176/28",
+    "4.151.119.48/28",
+    "52.255.109.112/28",
+    "52.255.109.80/28",
+    "20.161.75.208/28",
+    "68.154.28.96/28",
+    "52.255.109.128/28",
+    "52.225.75.208/28",
+    "52.190.139.48/28",
+    "68.221.67.240/28",
+    "52.156.77.144/28",
+    "52.148.129.32/28",
+    "40.84.221.208/28",
+    "104.210.139.224/28",
+    "40.84.221.224/28",
+    "104.210.139.192/28",
+  ]
diff --git a/data/crawlers/openai-gptbot.yaml b/data/crawlers/openai-gptbot.yaml
new file mode 100644
index 0000000..42658e7
--- /dev/null
+++ b/data/crawlers/openai-gptbot.yaml
@@ -0,0 +1,17 @@
+# Collects AI training data
+# https://platform.openai.com/docs/bots/overview-of-openai-crawlers
+- name: openai-gptbot
+  # NOTE(review): regex pins version 1.1 exactly; confirm this is intended.
+  user_agent_regex: GPTBot/1\.1; \+https\://openai\.com/gptbot
+  action: ALLOW
+  # https://openai.com/gptbot.json
+  remote_addresses: [
+    "52.230.152.0/24",
+    "20.171.206.0/24",
+    "20.171.207.0/24",
+    "4.227.36.0/25",
+    "20.125.66.80/28",
+    "172.182.204.0/24",
+    "172.182.214.0/24",
+    "172.182.215.0/24",
+  ]
diff --git a/data/crawlers/openai-searchbot.yaml b/data/crawlers/openai-searchbot.yaml
new file mode 100644
index 0000000..05796be
--- /dev/null
+++ b/data/crawlers/openai-searchbot.yaml
@@ -0,0 +1,14 @@
+# Indexing for search, does not collect training data
+# https://platform.openai.com/docs/bots/overview-of-openai-crawlers
+- name: openai-searchbot
+  # NOTE(review): regex pins version 1.0 exactly; confirm this is intended.
+  user_agent_regex: OAI-SearchBot/1\.0; \+https\://openai\.com/searchbot
+  action: ALLOW
+  # https://openai.com/searchbot.json
+  remote_addresses: [
+    "20.42.10.176/28",
+    "172.203.190.128/28",
+    "104.210.140.128/28",
+    "51.8.102.0/24",
+    "135.234.64.0/24",
+  ]