RAK 4630 Join Timeout

We’re trying to get the basic RUI3 LoRaWAN_OTAA example working on a custom PCB, and we’re getting some strange behavior. We’ve updated the band, the channel mask, as well as setting the appropriate keys and EUIs, per the docs. When we run the example, we never see any of the join attempts reach The Things Network. What we do see is this error returned after about 4-5 seconds:

13:48:32.595 → Wait for LoRaWAN join…+EVT:JOIN_FAILED_TX_TIMEOUT

A search for the root cause of that error message is coming up empty, other than one thread here (different module, but on RUI3):

Here’s the RUI version running on this device:

AT+VER=RUI_3.5.3_RAK4631

The module is functioning, and I can get it to join using the AT commands:

AT+NWM=1
AT+APPEUI=xxxxxxx
AT+DEVEUI=xxxxxxxx
AT+APPKEY=xxxxxxxxxxxxxx
AT+NJM=1
AT+BAND=5
AT+MASK=0002
AT+JOIN=1:0:8:20

Any pointers on what to check to figure out why it works via the AT commands but the RUI3 examples get a timeout error? The only changes we have made to the example are to insert our keys and IDs, and to set the mask to the required value.

Thanks for any help you can provide!

Hi @nmcminn ,

I tested my RAK4631-R here and it seems to work fine.

However, I have experienced a TIMEOUT error before, and I was able to fix it by registering a new device instead of reusing a pre-existing device registered to TTN.

You can probably try to add a new device with a new set of OTAA parameters, then upload your firmware.

Here’s my simple code for your reference:


/** Interval between uplinks in milliseconds; also used as the sleep duration in loop(). */
#define OTAA_PERIOD   (20000)
/*************************************

   LoRaWAN band setting (pick one of these for OTAA_BAND below):
     RAK_REGION_EU433
     RAK_REGION_CN470
     RAK_REGION_RU864
     RAK_REGION_IN865
     RAK_REGION_EU868
     RAK_REGION_US915
     RAK_REGION_AU915
     RAK_REGION_KR920
     RAK_REGION_AS923

 *************************************/
/** Regional band handed to api.lorawan.band.set() in setup(). */
#define OTAA_BAND     (RAK_REGION_US915)
/** OTAA Device EUI, 8 bytes, MSB first. Replace with the value registered on your network server. */
#define OTAA_DEVEUI   {0x70, 0xB3, 0xD5, 0x7E, 0xD0, 0x04, 0x85, 0x26}
/** OTAA Application EUI, 8 bytes, MSB first. */
#define OTAA_APPEUI   {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}
/** OTAA Application Key, 16 bytes, MSB first. Replace with your own key. */
#define OTAA_APPKEY   {0xDB, 0x54, 0xF1, 0xBB, 0xFE, 0x15, 0x7F, 0x92, 0xC2, 0x01, 0x84, 0xA6, 0x8D, 0x92, 0x79, 0x03}

/** Packet buffer for sending; filled by uplink_routine() before each uplink. */
uint8_t collected_data[64] = { 0 };
/**
 * Channel mask handed to api.lorawan.mask.set() in setup().
 * NOTE(review): 0x0002 mirrors the AT+MASK=0002 setting used with the AT commands;
 * presumably it selects the second US915 sub-band - confirm against the RUI3 docs.
 */
uint16_t maskBuff = 0x0002;

/**
 * Downlink callback: dumps the received payload to the serial console as hex.
 *
 * @param data  Receive descriptor supplied by the LoRaWAN stack; `Buffer`
 *              holds `BufferSize` payload bytes. Nothing is printed when the
 *              descriptor is null or the payload is empty.
 */
void recvCallback(SERVICE_LORA_RECEIVE_T * data)
{
  if (data == nullptr || data->BufferSize == 0) {
    return;
  }

  Serial.println("Something received!");
  for (int i = 0; i < data->BufferSize; i++) {
    // Always emit two digits per byte; a bare %x drops leading zeros, which
    // makes the dump ambiguous (0x01 0x23 and 0x12 0x03 would both read "123").
    Serial.printf("%02x", data->Buffer[i]);
  }
  Serial.print("\r\n");
}

/**
 * Join callback: prints the status code reported for a join attempt.
 *
 * @param status  Status code passed in by the LoRaWAN stack.
 *                NOTE(review): presumably 0 means success, as in
 *                sendCallback() - confirm against the RUI3 API docs.
 */
void joinCallback(int32_t status)
{
  // int32_t is `long` on arm-none-eabi, so cast explicitly to match %d.
  Serial.printf("Join status: %d\r\n", static_cast<int>(status));
}

/**
 * Send callback: reports whether the last uplink went out.
 *
 * @param status  Result code from the LoRaWAN stack; 0 is treated as success,
 *                any other value as failure.
 */
void sendCallback(int32_t status)
{
  const bool delivered = (status == 0);
  Serial.println(delivered ? "Successfully sent" : "Sending failed");
}

/**
 * One-time initialisation: configures the OTAA credentials, region and
 * channel mask, joins the network, then applies the uplink settings.
 *
 * Ordering matters here:
 *  - the callbacks are registered first so the result of the initial join
 *    is actually reported through joinCallback();
 *  - the channel mask is applied after the band is selected but BEFORE the
 *    join is started, so the join request goes out on the intended channels;
 *  - a join attempt is given time to finish before another one is issued.
 *
 * On any configuration failure the function prints a message and returns
 * early, leaving the device un-joined.
 */
void setup()
{
  Serial.begin(115200, RAK_AT_MODE);

  Serial.println("RAKwireless LoRaWan OTAA Example");
  Serial.println("------------------------------------------------------");

  // Register the callbacks up front so no join/receive/send event is missed.
  api.lorawan.registerRecvCallback(recvCallback);
  api.lorawan.registerJoinCallback(joinCallback);
  api.lorawan.registerSendCallback(sendCallback);

  // OTAA Device EUI MSB first
  uint8_t node_device_eui[8] = OTAA_DEVEUI;
  // OTAA Application EUI MSB first
  uint8_t node_app_eui[8] = OTAA_APPEUI;
  // OTAA Application Key MSB first
  uint8_t node_app_key[16] = OTAA_APPKEY;

  if (!api.lorawan.appeui.set(node_app_eui, 8)) {
    Serial.printf("LoRaWan OTAA - set application EUI is incorrect! \r\n");
    return;
  }
  if (!api.lorawan.appkey.set(node_app_key, 16)) {
    Serial.printf("LoRaWan OTAA - set application key is incorrect! \r\n");
    return;
  }
  if (!api.lorawan.deui.set(node_device_eui, 8)) {
    Serial.printf("LoRaWan OTAA - set device EUI is incorrect! \r\n");
    return;
  }

  if (!api.lorawan.band.set(OTAA_BAND)) {
    Serial.printf("LoRaWan OTAA - set band is incorrect! \r\n");
    return;
  }

  // The mask must be in place before the join request is transmitted;
  // it is set after the band so that selecting the band cannot undo it.
  Serial.printf("Set channel mask %s\r\n", api.lorawan.mask.set(&maskBuff) ? "Success" : "Fail");

  if (!api.lorawan.deviceClass.set(RAK_LORA_CLASS_A)) {
    Serial.printf("LoRaWan OTAA - set device class is incorrect! \r\n");
    return;
  }
  if (!api.lorawan.njm.set(RAK_LORA_OTAA))	// Set the network join mode to OTAA
  {
    Serial.printf("LoRaWan OTAA - set network join mode is incorrect! \r\n");
    return;
  }
  if (!api.lorawan.join())	// Join to Gateway
  {
    Serial.printf("LoRaWan OTAA - join fail! \r\n");
    return;
  }

  /** Wait for Join success */
  while (api.lorawan.njs.get() == 0) {
    Serial.print("Wait for LoRaWAN join...");
    // Let the attempt in flight finish before starting another one.
    delay(10000);
    if (api.lorawan.njs.get() == 0) {
      api.lorawan.join();
    }
  }

  if (!api.lorawan.adr.set(true)) {
    Serial.printf("LoRaWan OTAA - set adaptive data rate is incorrect! \r\n");
    return;
  }
  if (!api.lorawan.rety.set(1)) {
    Serial.printf("LoRaWan OTAA - set retry times is incorrect! \r\n");
    return;
  }
  if (!api.lorawan.cfm.set(1)) {
    Serial.printf("LoRaWan OTAA - set confirm mode is incorrect! \r\n");
    return;
  }

  /** Check LoRaWan Status*/
  Serial.printf("Duty cycle is %s\r\n", api.lorawan.dcs.get() ? "ON" : "OFF");	// Check Duty Cycle status
  Serial.printf("Packet is %s\r\n", api.lorawan.cfm.get() ? "CONFIRMED" : "UNCONFIRMED");	// Check Confirm status
  uint8_t assigned_dev_addr[4] = { 0 };
  api.lorawan.daddr.get(assigned_dev_addr, 4);
  Serial.printf("Device Address is %02X%02X%02X%02X\r\n", assigned_dev_addr[0], assigned_dev_addr[1], assigned_dev_addr[2], assigned_dev_addr[3]);	// Check Device Address
  Serial.printf("Uplink period is %ums\r\n", OTAA_PERIOD);
  Serial.println("");
}

void uplink_routine()
{
  /** Payload of Uplink */
  uint8_t data_len = 0;
  collected_data[data_len++] = (uint8_t) 't';
  collected_data[data_len++] = (uint8_t) 'e';
  collected_data[data_len++] = (uint8_t) 's';
  collected_data[data_len++] = (uint8_t) 't';

  Serial.println("Data Packet:");
  for (int i = 0; i < data_len; i++) {
    Serial.printf("0x%02X ", collected_data[i]);
  }
  Serial.println("");

  /** Send the data package */
  if (api.lorawan.send(data_len, (uint8_t *) & collected_data, 2, true, 1)) {
    Serial.println("Sending is requested");
  } else {
    Serial.println("Sending failed");
  }
}

/**
 * Main loop: sends an uplink once at least OTAA_PERIOD milliseconds have
 * passed since the previous one, then sleeps for OTAA_PERIOD.
 */
void loop()
{
  // Kept as unsigned long (presumably the width millis() returns - confirm)
  // so that `millis() - last` wraps correctly when the millisecond counter
  // rolls over.
  static unsigned long last = 0;

  // ">=" rather than ">": waking up after exactly one period must trigger the
  // uplink, otherwise the device would sleep through a second full period.
  if (millis() - last >= OTAA_PERIOD) {
    uplink_routine();

    last = millis();
  }
  //Serial.printf("Try sleep %ums..", OTAA_PERIOD);
  api.system.sleep.all(OTAA_PERIOD);
  //Serial.println("Wakeup..");
}

Thank you @carlrowan. I just copied your test code (which was virtually identical to what I was using).

I updated the EUIs and keys with a fresh set created on The Things Network, and ran the sketch. Unfortunately I’m getting exactly the same outcome:

09:29:24.581 → Wait for LoRaWAN join…+EVT:JOIN_FAILED_TX_TIMEOUT
09:29:34.580 → Wait for LoRaWAN join…+EVT:JOIN_FAILED_TX_TIMEOUT
09:29:44.873 → Wait for LoRaWAN join…+EVT:JOIN_FAILED_TX_TIMEOUT

I don’t see any traffic at all on the gateway, which leads me to think that no transmit is actually happening. Can you help me understand what “JOIN_FAILED_TX_TIMEOUT” actually means? Does that mean that the RAK4630 is timing out while attempting to transmit? If that is the case, what could be blocking it?

We have previously used this board to test LoRaWAN joins with the AT commands to make sure the PCB is functioning correctly. Could those tests have put the radio into a state where this sketch won’t run properly?

Just to be on the safe side, I have also reflashed the device with the latest RUI3 firmware. There were no errors during the flash, and I confirmed the version:

10:04:52.483 → AT+VER=RUI_3.5.3_RAK4631

@carlrowan and @beegee do you happen to have any internal documentation that can tell us exactly what the “JOIN_FAILED_TX_TIMEOUT” code actually means? RUI3 is closed source, so I can’t find out for myself, unfortunately. We’re blocked on this issue.

Thank you for any help you can provide

Hi @nmcminn ,

You have a very strange issue since AT commands work and you are only trying the basic OTAA code.

Maybe you can try to use a lower RAK4631-R BSP (3.4.2) and see if the behavior will be the same?

Hi @carlrowan we’ve checked this with RUI 3.5.3, 3.4.2 and 3.2.0, and we see the same behavior across all three versions of RUI3.

What does the message “JOIN_FAILED_TX_TIMEOUT” code actually mean? Does it mean the LoRaWAN radio is busy? Something else?

Hi @nmcminn ,

Really confusing issue :cry:

On AT command, it is working as you said in the earlier message.

This means that the Gateway and LNS (LoRaWAN Network Server) are ok. It also means the RF path is ok (I am not sure, though, what RSSI and SNR levels you get on the AT command test).

On the other hand, if it is firmware related, it should be repeatable (ideally), unless some prior sequence happened on the module that could potentially cause the issue. If possible, you can also attempt to upload a completely new firmware (bootloader+application code) using RAKDAP1 or Jlink. Also, do you have other modules to test? Are they showing the same behavior?

I have to check what exactly JOIN_FAILED_TX_TIMEOUT means. I can only assume that it is still in the process of sending the uplink. I’ll ask the software team about it.

Hi @carlrowan

This is an easily repeatable issue, we have two prototype PCBs here, both of which exhibit the same behavior. I was also getting the same result with a Wisblock 4631 module, until I flashed the bootloader to rui3_nrf52840_bootloader_latest, and also applied rui3_rak4631_latest, both found here:

Once we reflashed the 4631 test device with the new bootloader, it worked.

I realize that this is the bootloader for 3.4.2. I performed the same procedure on our custom PCB, and it is now also successfully joining and sending data via LoRaWAN. Here’s the current output from AT+VER:

AT+VER=3.4.2-rui3_22q1_update.112

I’m a bit nervous, as I don’t know exactly why this worked, but it appears that a full downgrade to 3.4.2 in the BSP, and flashing the device via BLE OTA, has fixed this particular issue.

I’m also trying to figure out what has changed from 3.4.2 to 3.5.3, but according to the Github releases 3.4.2 is the latest:

Is there a changelog from 3.4.2 → 3.5.3 that I’m missing?

Hi @nmcminn ,

As for the change logs, you can find more details here - https://downloads.rakwireless.com/RUI/RUI3/Image/CHANGELOG.md

For the BSP, this is the correct link - GitHub - RAKWireless/RAKwireless-Arduino-BSP-Index: RAKwireless BSP Support for the Arduino Board Manager

To view the changes between different versions of the BSP, you can also look at the staging branch and check the particular version: GitHub - RAKWireless/RAKwireless-Arduino-BSP-Index at staging

Soon we will release 3.5.4 (doing final verification) then a major one 4.0.0. These are more stable versions. I will try and verify what you experience. Btw, for the update of firmware, all you use is nrfutil correct? You do not use any external tools like jlink or rakdap1?

@carlrowan, it looks like that change log link is dead, but the BSP staging link you sent is fine.

I’ve done some further testing with 3.4.2, and it looks like it only works very intermittently, and even then only for a short time.

I’ve built LoRaWAN firmware for many different devices, including ESP32’s with a LoRa radio on the board, the Murata Type ABZ module, Pycom devices, and a few others. In every case I have looked at, the sub band within the region is set before the join is initiated. This is also true of joining the network with that AT command example that I provided in the opening of this thread.

However, in the code example you posted, the channel mask is only set after the join is successful. Could this be the cause of this problem? I’ve tried moving the line of code that sets the mask to before the join is initiated, but setting the mask there returns an error.

EDIT: As a test, I moved the setting of the channel mask to before the first join call, with no change in results. Every time we try to get this thing to join, we get the same +EVT:JOIN_FAILED_TX_TIMEOUT error message.

Is there any additional debug output we can turn on, or anything to further help troubleshoot this issue? We’ve been struggling with this for weeks.

Hi @nmcminn ,

You are correct that the mask should be set first before joining. Could be that the mask is already configured on my test.

For the changelog, you can check this temporarily - Dropbox - CHANGELOG.md - Simplify your life

Can you give me more details on the module?

  1. What is the initial firmware on the module?
  2. How did you flash this firmware? Using a Jlink/DAP tool? Or via nrfutil?
  3. For BLE OTA, did you use nrfconnect or wistoolbox?

Btw, for the +EVT:JOIN_FAILED_TX_TIMEOUT, R&D informed me of the following possible reasons:

  • The LoRa module is sleeping
  • The user performs an abnormal operation while the device is trying to JOIN
  • The RAK4631 interrupt is abnormal

This is still not a directly useful list, since it is very generic.

I will replicate all your steps and see if I can get same result.

  1. When we received these modules, the firmware running on them was the default 3.5.3.
  2. I have reflashed the device multiple times, using nrfutil.
  3. When updating back to the factory firmware using BLE OTA, I used nrfconnect.

It might be worth noting that we used these modules for our CE / FCC testing procedures. Our test lab used several of the test modes built into the default firmware to perform these tests (continuous wave, LoRaWAN certification, tone test, etc.). Could one of those modes have left the module in a state that could cause this problem?

Is there a way to fully reset the module back to factory, removing all configuration and any state stored in the device?

@nmcminn

Answering here both (your post here and your email)

AT+Z ATR will reset the device to factory defaults.

Inside RUI3, the AT command AT+JOIN and the API call api.lorawan.join() are the same code on a lower level.

What is AT+JOIN=? returning? If automatic join is enabled through AT command (first two parameters == 1), try to disable it with AT+JOIN=0:0 and check whether the API call from your code works differently.

If you set automatic joining with AT command (AT+JOIN=1:1) and remove the API call api.lorawan.join() from your code, is it able to join?

I am asking this, because using AT+JOIN=1:1 and calling api.lorawan.join() from the code could lead to a runtime situation. I still have to check, but I am guessing that the device would try to automatically join on startup (because of AT+JOIN=1:1), but when you call api.lorawan.join() from your code, an ongoing join sequence would be cancelled and the join would be restarted.

If you do not have a specific reason to call api.lorawan.join() from your code, using AT+JOIN=1:1 should be the better solution. You can check the status of the join request with api.lorawan.njs.get():

    // Poll the network join status every 10 seconds until the device has joined.
    while (!api.lorawan.njs.get()) {
      Serial.print("Waiting for Lorawan join...");
      delay(10000);
    }
1 Like

@beegee, still no luck here. It looks like AT+Z is not a valid command. ATZ (without the +) will reset the MCU, but doesn’t change any settings. According to the AT command set docs ATR is the command to restore factory defaults. There are a few commands listed in the firmware AT? output that aren’t in the docs, but that’s a separate issue.

Here are the steps I went through (Using RUI 3.5.3 BSP).

  1. Loaded an “empty” sketch via the Arduino that simply has a 5 second delay in the “loop” function.
  2. Using the AT commands, I restored the module to factory defaults (ATR).
  3. I ran AT+JOIN=? to get the join settings. AT+JOIN returned 0:0:8:0. This should mean not joining, no auto join on power up, 8 second delay between attempts, and zero retry attempts, correct?

Once that was done, I loaded the sketch that should join the LoRaWAN network, a very slightly modified version of the code @carlrowan provided above.

When that code starts up, we saw the same result (join failure, with EVT:JOIN_FAILED_TX_TIMEOUT). If I run the AT+JOIN=? command again, while the LoRaWAN sketch is running, we get a result of AT+JOIN=1:0:8:0, which should mean that the device is trying to join, is not set up for automatic join on power on, 8 seconds between join retries, and zero retries.

Next, I loaded the “empty” sketch again so that I could reconfigure the device’s join behavior using only AT commands:

  1. ATR to restore factory settings
  2. AT+JOIN=1:1:8:8, join, join on power up, 8 seconds between retries, 8 join attempts.

I removed all of the api.lorawan.join() calls from our LoRaWAN sketch, but left in the calls to do things like set the channel mask, etc. When we do this, the first call to set up the LoRaWAN settings (a call to set the app EUI) fails. I’m assuming that this is because the device is already joining due to the join on power up behavior, and things like EUIs cannot be changed while the device is attempting to join.

Next, I decided to test and see if the join with only AT commands still works. I used nrfconnect to flash the factory RUI 3.5.3 firmware package (RUI_3.5.3_RAK4631_dfu_package.zip). I ran ATR to reset to defaults, and then used the exact same EUIs and keys we used the first time, and the same commands and sequence defined in the opening message of this thread. Now the module will not join at all, even using just the AT command set. Here’s the console output.

09:03:37.126 → +EVT:JOIN_FAILED_TX_TIMEOUT
09:03:57.308 → +EVT:JOIN_FAILED_TX_TIMEOUT
09:04:09.316 → +EVT:JOIN_FAILED_TX_TIMEOUT
09:04:21.303 → +EVT:JOIN_FAILED_TX_TIMEOUT
09:04:33.320 → +EVT:JOIN_FAILED_TX_TIMEOUT

I did a little hunting around on Github, and found a DFU package for 3.5.4 (RUI_3.5.4_RAK4631_dfu_package.zip) and decided to give it a try. I flashed it to the device using nrfconnect, then ran ATR to restore factory defaults. Then I used the same AT commands to set up the LoRaWAN connection. I was able to get it to join twice, but when I tried to reproduce the results it was back to the timeout error.

To summarize we have gotten the device to join the network via code once or twice, and via the AT commands once or twice, but very inconsistently.

What other options do we have here?

Hi Nathan,

From your screenshot, I guess you are using TTN as the LoRaWAN server.
Do you see any other messages after the two join-request and join-accept messages in the log?

Not sure if I asked already, but do you see Join Request messages in the gateways packet log screen when it fails to join?

Good morning @beegee.

When we see the +EVT:JOIN_FAILED_TX_TIMEOUT on the device, we do not see any messages received by the gateway, or by the LoRaWAN server. This error is preventing the device from transmitting anything, as best I can tell.

This is definitely a problem on the device itself.

I tried earlier today to connect a RAK4631 with RUI3 (V3.5.4, same as you) to my US915 Chirpstack server (on a VPN in Germany) and I didn’t have any serious problems, besides one or two failed retries.

It worked with both AT command and API call to join.

Your problem is really strange.
Our R&D team is out on Chinese New Year holidays already, so I will not get any help from them for a week.

Would you be willing to run a test against my US915 server?

I can definitely do that, yes! I have a gateway here that I can set up to point to whatever endpoint you think would work. However, I don’t think we will see different results as I don’t see any messages ever hitting the gateway at all.

@beegee and @carlrowan I think I figured it out. Here’s what happened:

I was doing some further testing this morning. I flashed RUI 3.4.2 to the device using nrfconnect. Here’s the output from AT+VER:

AT+VER=3.4.2-rui3_22q1_update.112

Once the device restarted, I ran the same AT commands, and the device was able to join the network:

Next, I reset the device with ATZ, and asked it to join again. This time, it failed to join with the same error message, +EVT:JOIN_FAILED_TX_TIMEOUT

If I first run ATR to restore factory settings, then ATZ to restart the MCU, then flash 3.4.2-rui3_22q1_update.112 to the device using nrfconnect and then finally run the join sequence AT commands, it will usually join, and be able to join again after powering the device off and then back on.

This was encouraging, so I ran a few additional tests. The first one was to load the default 3.5.3 firmware using nrfconnect. Then, I configured the device to join using the AT commands. The device failed to join. I powered the device off, and then back on, and waited a few minutes before issuing the join command. This time, it worked. Curious. I played around a little with waiting different intervals before issuing the LoRaWAN AT commands to join the network, and found after 30-60 seconds it would work consistently. This is the same amount of time that the BLE radio is on when the device first starts up, as confirmed by using a Bluetooth scanner. I added the following two lines of code to the setup() function of the test sketch:

  // Turn off the BLE UART service. Added to setup() of the test sketch after
  // observing that joins only succeeded once the BLE radio was no longer active.
  api.ble.uart.stop();
  // Turn off BLE advertising.
  api.ble.advertise.stop();

With these two lines included in the code, the LoRaWAN join and subsequent message sends work exactly as expected.

It looks like the LoRaWAN radio can’t be used until the Bluetooth is shut off. Is there some kind of software interlock to prevent the two radios from being used at the same time? Or, since the R&D team said that this could be interrupt related, could the BLE radio be driving some interrupts that knock out the LoRaWAN radio?

Hi Nathan,

I am not disabling BLE on my device and don’t have this problem. The BLE and LoRa drivers are independent and there should be no interlock between the two.

The RAK4630 is on a custom PCB? If yes, can you show me the power-supply connections of the RAK4630 module?
On our WisBlock RAK4631 Core module the power supply is like this:
image